use of scala.Tuple2 in project gatk by broadinstitute.
the class ReadsForQNamesFinder method call.
/**
 * Buckets the supplied reads by interval ID.
 * <p>
 * Each read's name is looked up in {@code qNamesMultiMap}; for every {@link QNameAndInterval} found,
 * the read (converted to a {@link SVFastqUtils.FastqRead}) is appended to the list of that entry's interval.
 * A read is converted at most once, and the same converted instance is shared by all of its intervals.
 *
 * @param readsItr the reads to examine
 * @return one (interval ID, reads) pair for each interval that received at least one read,
 *         in ascending interval ID order
 */
public Iterable<Tuple2<Integer, List<SVFastqUtils.FastqRead>>> call(final Iterator<GATKRead> readsItr) {
    @SuppressWarnings({ "unchecked", "rawtypes" })
    final List<SVFastqUtils.FastqRead>[] readsByInterval = new List[nIntervals];
    int populatedCount = 0;

    while (readsItr.hasNext()) {
        final GATKRead read = readsItr.next();
        // Created lazily, so reads whose name matches nothing are never converted.
        SVFastqUtils.FastqRead converted = null;
        for (final Iterator<QNameAndInterval> hits = qNamesMultiMap.findEach(read.getName()); hits.hasNext(); ) {
            final int intervalId = hits.next().getIntervalId();
            List<SVFastqUtils.FastqRead> bucket = readsByInterval[intervalId];
            if (bucket == null) {
                bucket = new ArrayList<>(nReadsPerInterval);
                readsByInterval[intervalId] = bucket;
                populatedCount++;
            }
            if (converted == null) {
                final String readName;
                if (dumpFASTQs) {
                    readName = SVFastqUtils.readToFastqSeqId(read, includeMappingLocation);
                } else {
                    readName = null;
                }
                converted = new SVFastqUtils.FastqRead(readName, read.getBases(), read.getBaseQualities());
            }
            bucket.add(converted);
        }
    }

    final List<Tuple2<Integer, List<SVFastqUtils.FastqRead>>> fastQRecords = new ArrayList<>(populatedCount);
    if (populatedCount == 0) {
        return fastQRecords;
    }
    for (int intervalId = 0; intervalId < nIntervals; intervalId++) {
        final List<SVFastqUtils.FastqRead> bucket = readsByInterval[intervalId];
        if (bucket != null) {
            fastQRecords.add(new Tuple2<>(intervalId, bucket));
        }
    }
    return fastQRecords;
}
use of scala.Tuple2 in project gatk by broadinstitute.
the class RunSGAViaProcessBuilderOnSpark method writeToLocal.
/**
 * Utility function that unloads the FASTQ contents for a breakpoint to a local file for later consumption by SGA.
 * <p>
 * The file keeps the name of the original FASTQ file and is placed in a newly created temporary directory.
 * The breakpoint ID is parsed before anything is written, so a malformed file name does not leave
 * temporary files behind.
 *
 * @param oneBreakPoint       input for one breakpoint, where the first is the path to the FASTQ file and the second is the FASTQ file's content
 * @param subStringToStripout substring removed from the base name of the FASTQ path; what remains is parsed as the breakpoint ID
 * @return the breakpoint ID and with the FASTQ file contents dumped to a local File
 * @throws IOException           if fails to create the temporary directory or fails to write to local file
 * @throws NumberFormatException if the base name, after removing {@code subStringToStripout}, is not a parsable {@code long}
 */
@VisibleForTesting
static Tuple2<Long, File> writeToLocal(final Tuple2<String, String> oneBreakPoint, final String subStringToStripout) throws IOException {
    final String fastqPath = oneBreakPoint._1();

    // Fail fast on an unparsable ID, before any directory or file is created on disk.
    final Long breakpointID = Long.valueOf(FilenameUtils.getBaseName(fastqPath).replace(subStringToStripout, ""));

    final String fastqFilename = FilenameUtils.getName(fastqPath);
    final File localTempWorkingDir = Files.createTempDirectory(fastqFilename + "_").toAbsolutePath().toFile();
    // NOTE(review): File.deleteOnExit can only remove a directory that is empty at JVM exit;
    // the FASTQ file written below is not registered here, so its cleanup is presumably left to the caller - confirm.
    localTempWorkingDir.deleteOnExit();

    final File localFASTQFile = new File(localTempWorkingDir, fastqFilename);
    // Explicit charset: the two-argument overload is deprecated because it encodes with the platform default.
    FileUtils.writeStringToFile(localFASTQFile, oneBreakPoint._2(), java.nio.charset.StandardCharsets.UTF_8);

    return new Tuple2<>(breakpointID, localFASTQFile);
}
use of scala.Tuple2 in project gatk by broadinstitute.
the class CoverageModelEMWorkspace method getCopyRatioSegmentsSpark.
/**
 * Fetch copy ratio segments from compute blocks (Spark implementation).
 * <p>
 * The emission data returned by {@code fetchCopyRatioEmissionDataSpark()} is keyed by sample index.
 * Each entry is first turned into {@link CopyRatioHMMResults} and then into a list of segments;
 * the results are collected and ordered by sample index.
 *
 * @return one list of {@link HiddenStateSegmentRecord}s per sample, in ascending sample index order
 */
private List<List<HiddenStateSegmentRecord<STATE, Target>>> getCopyRatioSegmentsSpark() {
    /* local final copies of member variables for lambda capture */
    final List<Target> targets = new ArrayList<>(this.processedTargetList);
    final List<SexGenotypeData> sexGenotypes = new ArrayList<>(this.processedSampleSexGenotypeData);
    final List<String> sampleNames = new ArrayList<>(this.processedSampleNameList);
    final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);
    final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> calculator =
            this.copyRatioExpectationsCalculator;
    final BiFunction<SexGenotypeData, Target, STATE> stateFactory = this.referenceStateFactory;

    final List<Tuple2<Integer, List<HiddenStateSegmentRecord<STATE, Target>>>> segmentsBySample =
            fetchCopyRatioEmissionDataSpark()
                    /* stage 1: emission data -> HMM results */
                    .mapPartitionsToPair(emissionItr -> {
                        final List<Tuple2<Integer, CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE>>> hmmResults =
                                new ArrayList<>();
                        while (emissionItr.hasNext()) {
                            final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> entry = emissionItr.next();
                            final int si = entry._1;
                            final CopyRatioCallingMetadata metadata = CopyRatioCallingMetadata.builder()
                                    .sampleName(sampleNames.get(si))
                                    .sampleSexGenotypeData(sexGenotypes.get(si))
                                    .sampleCoverageDepth(sampleReadDepths.getDouble(si))
                                    .emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN)
                                    .build();
                            hmmResults.add(new Tuple2<>(si,
                                    calculator.getCopyRatioHMMResults(metadata, targets, entry._2)));
                        }
                        return hmmResults.iterator();
                    }, true)
                    /* stage 2: HMM results -> segments */
                    .mapPartitionsToPair(hmmItr -> {
                        final List<Tuple2<Integer, List<HiddenStateSegmentRecord<STATE, Target>>>> segments =
                                new ArrayList<>();
                        while (hmmItr.hasNext()) {
                            final Tuple2<Integer, CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE>> entry =
                                    hmmItr.next();
                            final int si = entry._1;
                            final CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE> hmm = entry._2;
                            final HMMSegmentProcessor<CoverageModelCopyRatioEmissionData, STATE, Target> segmenter =
                                    new HMMSegmentProcessor<>(
                                            Collections.singletonList(hmm.getMetaData().getSampleName()),
                                            Collections.singletonList(hmm.getMetaData().getSampleSexGenotypeData()),
                                            stateFactory,
                                            Collections.singletonList(new HashedListTargetCollection<>(targets)),
                                            Collections.singletonList(hmm.getForwardBackwardResult()),
                                            Collections.singletonList(hmm.getViterbiResult()));
                            segments.add(new Tuple2<>(si, segmenter.getSegmentsAsList()));
                        }
                        return segments.iterator();
                    })
                    .collect();

    /* order by sample index and drop the index */
    return segmentsBySample.stream()
            .sorted(Comparator.comparingInt(indexed -> indexed._1))
            .map(indexed -> indexed._2)
            .collect(Collectors.toList());
}
use of scala.Tuple2 in project gatk by broadinstitute.
the class ShuffleJoinReadsWithRefBases method addBases.
/**
 * Pairs every read key of a read-keyed RDD with the reference bases of that read's reference window.
 * <p>
 * Entries are grouped by the {@link ReferenceShard} of their reference window. For each group the
 * reference is queried once, over the interval spanning all windows in the group, and every read then
 * receives the subset covering its own window.
 *
 * @param referenceDataflowSource The source of the reference sequence information
 * @param keyedByRead The read-keyed RDD for which to extract reference sequence information
 * @return The JavaPairRDD that contains each read along with its original value and the corresponding ReferenceBases object
 */
public static <T> JavaPairRDD<GATKRead, Tuple2<T, ReferenceBases>> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaPairRDD<GATKRead, T> keyedByRead) {
    final SerializableFunction<GATKRead, SimpleInterval> referenceWindowOf = referenceDataflowSource.getReferenceWindowFunction();

    // Key each (read, value) entry by the shard of the read's reference window.
    final JavaPairRDD<ReferenceShard, Tuple2<GATKRead, T>> entriesByShard = keyedByRead.mapToPair(
            entry -> new Tuple2<>(ReferenceShard.getShardNumberFromInterval(referenceWindowOf.apply(entry._1())), entry));

    final JavaPairRDD<ReferenceShard, Iterable<Tuple2<GATKRead, T>>> groupedByShard = entriesByShard.groupByKey();

    return groupedByShard.flatMapToPair(shardAndEntries -> {
        final Iterable<Tuple2<GATKRead, T>> entries = shardAndEntries._2();

        // One reference query per shard, over the span of all windows in it.
        final List<SimpleInterval> windows = Utils.stream(entries)
                .map(entry -> referenceWindowOf.apply(entry._1()))
                .collect(Collectors.toList());
        final SimpleInterval span = IntervalUtils.getSpanningInterval(windows);
        final ReferenceBases shardBases = referenceDataflowSource.getReferenceBases(null, span);

        final List<Tuple2<GATKRead, Tuple2<T, ReferenceBases>>> joined = Lists.newArrayList();
        for (final Tuple2<GATKRead, T> entry : entries) {
            final GATKRead read = entry._1();
            final ReferenceBases readBases = shardBases.getSubset(referenceWindowOf.apply(read));
            joined.add(new Tuple2<>(read, new Tuple2<>(entry._2(), readBases)));
        }
        return joined.iterator();
    });
}
use of scala.Tuple2 in project gatk by broadinstitute.
the class ShuffleJoinReadsWithVariants method pairReadsWithVariantShards.
/**
 * Emits one (shard, read) pair for every {@link VariantShard} that
 * {@code VariantShard.getVariantShardsFromInterval} returns for a read.
 *
 * @param reads the reads to key by variant shard
 * @return an RDD in which each read appears once per shard returned for it
 */
private static JavaPairRDD<VariantShard, GATKRead> pairReadsWithVariantShards(final JavaRDD<GATKRead> reads) {
    return reads.flatMapToPair(read ->
            VariantShard.getVariantShardsFromInterval(read).stream()
                    .map(shard -> new Tuple2<>(shard, read))
                    .iterator());
}
Aggregations