Use of org.broadinstitute.hellbender.tools.coveragemodel.interfaces.CopyRatioExpectationsCalculator in project gatk by broadinstitute.
The class CoverageModelEMWorkspace, method updateCopyRatioPosteriorExpectationsSpark.
/**
* The Spark implementation of the E-step update of copy ratio posteriors
*
* @return a {@link SubroutineSignal} containing the update size
*/
@EvaluatesRDD
@UpdatesRDD
@CachesRDD
private SubroutineSignal updateCopyRatioPosteriorExpectationsSpark(final double admixingRatio) {
    /* local final copies of member variables for lambda capture */
    final List<LinearlySpacedIndexBlock> targetBlocks = new ArrayList<>(this.targetBlocks);
    final List<Target> targetList = new ArrayList<>(processedTargetList);
    final List<String> sampleNameList = new ArrayList<>(processedSampleNameList);
    final List<SexGenotypeData> sampleSexGenotypeData = new ArrayList<>(processedSampleSexGenotypeData);
    final int numTargetBlocks = targetBlocks.size();
    final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> calculator =
            this.copyRatioExpectationsCalculator;
    final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);

    /* make an RDD of copy ratio posterior expectations by fetching the
     * copy ratio emission data from the workers */
    final JavaPairRDD<Integer, CopyRatioExpectations> copyRatioPosteriorExpectationsPairRDD =
            fetchCopyRatioEmissionDataSpark()
                    .mapPartitionsToPair(it -> {
                        final List<Tuple2<Integer, CopyRatioExpectations>> newPartitionData = new ArrayList<>();
                        while (it.hasNext()) {
                            final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> prevDatum = it.next();
                            final int si = prevDatum._1;
                            final CopyRatioCallingMetadata copyRatioCallingMetadata = CopyRatioCallingMetadata.builder()
                                    .sampleName(sampleNameList.get(si))
                                    .sampleSexGenotypeData(sampleSexGenotypeData.get(si))
                                    .sampleCoverageDepth(sampleReadDepths.getDouble(si))
                                    .emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN)
                                    .build();
                            newPartitionData.add(new Tuple2<>(prevDatum._1,
                                    calculator.getCopyRatioPosteriorExpectations(copyRatioCallingMetadata,
                                            targetList, prevDatum._2)));
                        }
                        return newPartitionData.iterator();
                    }, true);

    /* copyRatioPosteriorExpectationsPairRDD is used twice (steps 1 and 2 below), so we cache it */
    copyRatioPosteriorExpectationsPairRDD.cache();

    /* step 1. update the log chain posterior expectations on the driver node */
    final double[] newSampleLogChainPosteriors = copyRatioPosteriorExpectationsPairRDD
            .mapValues(CopyRatioExpectations::getLogChainPosteriorProbability)
            .collect().stream()
            .sorted(Comparator.comparingInt(t -> t._1))
            .mapToDouble(t -> t._2)
            .toArray();
    sampleLogChainPosteriors.assign(Nd4j.create(newSampleLogChainPosteriors, new int[] {numSamples, 1}));

    /* step 2. repartition in target space */
    final JavaPairRDD<LinearlySpacedIndexBlock, ImmutablePair<INDArray, INDArray>> blockifiedCopyRatioPosteriorResultsPairRDD =
            copyRatioPosteriorExpectationsPairRDD
                    .flatMapToPair(dat -> targetBlocks.stream()
                            .map(tb -> new Tuple2<>(tb, new Tuple2<>(dat._1,
                                    ImmutablePair.of(dat._2.getLogCopyRatioMeans(tb),
                                            dat._2.getLogCopyRatioVariances(tb)))))
                            .iterator())
                    .combineByKey(
                            /* recipe to create a singleton list */
                            Collections::singletonList,
                            /* recipe to add an element to the list */
                            (list, element) -> Stream.concat(list.stream(), Stream.of(element))
                                    .collect(Collectors.toList()),
                            /* recipe to concatenate two lists */
                            (list1, list2) -> Stream.concat(list1.stream(), list2.stream())
                                    .collect(Collectors.toList()),
                            /* repartition with respect to target-space blocks */
                            new HashPartitioner(numTargetBlocks))
                    .mapValues(list -> list.stream()
                            .sorted(Comparator.comparingInt(t -> t._1))
                            .map(p -> p._2)
                            .map(t -> ImmutablePair.of(Nd4j.create(t.left), Nd4j.create(t.right)))
                            .collect(Collectors.toList()))
                    .mapValues(CoverageModelEMWorkspace::stackCopyRatioPosteriorDataForAllSamples);

    /* we do not need the copy ratio expectations anymore */
    copyRatioPosteriorExpectationsPairRDD.unpersist();

    /* step 3. merge with computeRDD and update */
    computeRDD = computeRDD.join(blockifiedCopyRatioPosteriorResultsPairRDD)
            .mapValues(t -> t._1.cloneWithUpdatedCopyRatioPosteriors(t._2.left, t._2.right, admixingRatio));
    cacheWorkers("after E-step for copy ratio update");

    /* collect subroutine signals and report the infinity norm of the residual error */
    final List<SubroutineSignal> sigs = mapWorkersAndCollect(CoverageModelEMComputeBlock::getLatestMStepSignal);
    final double errorNormInfinity = Collections.max(sigs.stream()
            .map(sig -> sig.<Double>get(StandardSubroutineSignals.RESIDUAL_ERROR_NORM))
            .collect(Collectors.toList()));
    return SubroutineSignal.builder()
            .put(StandardSubroutineSignals.RESIDUAL_ERROR_NORM, errorNormInfinity)
            .build();
}
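The combineByKey stage is the densest part of this method, so a compact illustration may help. Below is a minimal, self-contained sketch of the same singleton/append/concatenate recipe on toy data; the class name BlockifyDemo, the local[2] master, and the double[] payloads are invented for the example and are not part of GATK.

import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

/* Toy demonstration of the combineByKey "blockify" recipe: values keyed by block
 * index are regrouped into one list per block, then sorted by sample index. */
public final class BlockifyDemo {
    public static void main(final String[] args) {
        try (final JavaSparkContext ctx = new JavaSparkContext("local[2]", "blockify-demo")) {
            /* (blockIndex, (sampleIndex, payload)) pairs, one per sample per block */
            final List<Tuple2<Integer, Tuple2<Integer, double[]>>> data = Arrays.asList(
                    new Tuple2<>(0, new Tuple2<>(1, new double[] {0.1})),
                    new Tuple2<>(0, new Tuple2<>(0, new double[] {0.2})),
                    new Tuple2<>(1, new Tuple2<>(0, new double[] {0.3})),
                    new Tuple2<>(1, new Tuple2<>(1, new double[] {0.4})));
            final int numBlocks = 2;
            final JavaPairRDD<Integer, List<Tuple2<Integer, double[]>>> grouped = ctx.parallelizePairs(data)
                    .combineByKey(
                            /* the first value seen for a key becomes a singleton list */
                            Collections::singletonList,
                            /* further values are appended by building a fresh list */
                            (list, element) -> Stream.concat(list.stream(), Stream.of(element))
                                    .collect(Collectors.toList()),
                            /* partial lists from different partitions are concatenated */
                            (list1, list2) -> Stream.concat(list1.stream(), list2.stream())
                                    .collect(Collectors.toList()),
                            /* one partition per block, as with new HashPartitioner(numTargetBlocks) */
                            new HashPartitioner(numBlocks));
            /* restore deterministic per-sample order within each block */
            final JavaPairRDD<Integer, List<double[]>> blockified = grouped
                    .mapValues(list -> list.stream()
                            .sorted(Comparator.comparingInt(t -> t._1))
                            .map(t -> t._2)
                            .collect(Collectors.toList()));
            blockified.collect().forEach(t ->
                    System.out.println("block " + t._1 + ": " + t._2.size() + " per-sample payloads"));
        }
    }
}

Note that Collections::singletonList yields an immutable list, which is why both merge functions build new lists through streams rather than mutating with List.add; the trade-off is extra allocation in exchange for safe reuse of combiner outputs.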
Use of org.broadinstitute.hellbender.tools.coveragemodel.interfaces.CopyRatioExpectationsCalculator in project gatk-protected by broadinstitute.
The class CoverageModelEMWorkspace, method updateCopyRatioPosteriorExpectationsSpark.
/**
* The Spark implementation of the E-step update of copy ratio posteriors
*
* @return a {@link SubroutineSignal} containing the update size
*/
@EvaluatesRDD
@UpdatesRDD
@CachesRDD
private SubroutineSignal updateCopyRatioPosteriorExpectationsSpark(final double admixingRatio) {
    /* local final copies of member variables for lambda capture */
    final List<LinearlySpacedIndexBlock> targetBlocks = new ArrayList<>(this.targetBlocks);
    final List<Target> targetList = new ArrayList<>(processedTargetList);
    final List<String> sampleNameList = new ArrayList<>(processedSampleNameList);
    final List<SexGenotypeData> sampleSexGenotypeData = new ArrayList<>(processedSampleSexGenotypeData);
    final int numTargetBlocks = targetBlocks.size();
    final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> calculator =
            this.copyRatioExpectationsCalculator;
    final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);

    /* make an RDD of copy ratio posterior expectations by fetching the
     * copy ratio emission data from the workers */
    final JavaPairRDD<Integer, CopyRatioExpectations> copyRatioPosteriorExpectationsPairRDD =
            fetchCopyRatioEmissionDataSpark()
                    .mapPartitionsToPair(it -> {
                        final List<Tuple2<Integer, CopyRatioExpectations>> newPartitionData = new ArrayList<>();
                        while (it.hasNext()) {
                            final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> prevDatum = it.next();
                            final int si = prevDatum._1;
                            final CopyRatioCallingMetadata copyRatioCallingMetadata = CopyRatioCallingMetadata.builder()
                                    .sampleName(sampleNameList.get(si))
                                    .sampleSexGenotypeData(sampleSexGenotypeData.get(si))
                                    .sampleCoverageDepth(sampleReadDepths.getDouble(si))
                                    .emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN)
                                    .build();
                            newPartitionData.add(new Tuple2<>(prevDatum._1,
                                    calculator.getCopyRatioPosteriorExpectations(copyRatioCallingMetadata,
                                            targetList, prevDatum._2)));
                        }
                        return newPartitionData.iterator();
                    }, true);

    /* copyRatioPosteriorExpectationsPairRDD is used twice (steps 1 and 2 below), so we cache it */
    copyRatioPosteriorExpectationsPairRDD.cache();

    /* step 1. update the log chain posterior expectations on the driver node */
    final double[] newSampleLogChainPosteriors = copyRatioPosteriorExpectationsPairRDD
            .mapValues(CopyRatioExpectations::getLogChainPosteriorProbability)
            .collect().stream()
            .sorted(Comparator.comparingInt(t -> t._1))
            .mapToDouble(t -> t._2)
            .toArray();
    sampleLogChainPosteriors.assign(Nd4j.create(newSampleLogChainPosteriors, new int[] {numSamples, 1}));

    /* step 2. repartition in target space */
    final JavaPairRDD<LinearlySpacedIndexBlock, ImmutablePair<INDArray, INDArray>> blockifiedCopyRatioPosteriorResultsPairRDD =
            copyRatioPosteriorExpectationsPairRDD
                    .flatMapToPair(dat -> targetBlocks.stream()
                            .map(tb -> new Tuple2<>(tb, new Tuple2<>(dat._1,
                                    ImmutablePair.of(dat._2.getLogCopyRatioMeans(tb),
                                            dat._2.getLogCopyRatioVariances(tb)))))
                            .iterator())
                    .combineByKey(
                            /* recipe to create a singleton list */
                            Collections::singletonList,
                            /* recipe to add an element to the list */
                            (list, element) -> Stream.concat(list.stream(), Stream.of(element))
                                    .collect(Collectors.toList()),
                            /* recipe to concatenate two lists */
                            (list1, list2) -> Stream.concat(list1.stream(), list2.stream())
                                    .collect(Collectors.toList()),
                            /* repartition with respect to target-space blocks */
                            new HashPartitioner(numTargetBlocks))
                    .mapValues(list -> list.stream()
                            .sorted(Comparator.comparingInt(t -> t._1))
                            .map(p -> p._2)
                            .map(t -> ImmutablePair.of(Nd4j.create(t.left), Nd4j.create(t.right)))
                            .collect(Collectors.toList()))
                    .mapValues(CoverageModelEMWorkspace::stackCopyRatioPosteriorDataForAllSamples);

    /* we do not need the copy ratio expectations anymore */
    copyRatioPosteriorExpectationsPairRDD.unpersist();

    /* step 3. merge with computeRDD and update */
    computeRDD = computeRDD.join(blockifiedCopyRatioPosteriorResultsPairRDD)
            .mapValues(t -> t._1.cloneWithUpdatedCopyRatioPosteriors(t._2.left, t._2.right, admixingRatio));
    cacheWorkers("after E-step for copy ratio update");

    /* collect subroutine signals and report the infinity norm of the residual error */
    final List<SubroutineSignal> sigs = mapWorkersAndCollect(CoverageModelEMComputeBlock::getLatestMStepSignal);
    final double errorNormInfinity = Collections.max(sigs.stream()
            .map(sig -> sig.<Double>get(StandardSubroutineSignals.RESIDUAL_ERROR_NORM))
            .collect(Collectors.toList()));
    return SubroutineSignal.builder()
            .put(StandardSubroutineSignals.RESIDUAL_ERROR_NORM, errorNormInfinity)
            .build();
}
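Two ND4J idioms recur in this method: Transforms.exp(..., true), which exponentiates into a fresh copy rather than in place, and assign with an explicit {numSamples, 1} shape, which writes a driver-side double[] back into a preallocated column vector. Below is a minimal sketch of both, assuming only ND4J on the classpath; the class name Nd4jIdiomsDemo and all numeric values are made up for illustration.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;

public final class Nd4jIdiomsDemo {
    public static void main(final String[] args) {
        final int numSamples = 3;
        /* mean log read depths as a (numSamples x 1) column vector (values invented) */
        final INDArray sampleMeanLogReadDepths =
                Nd4j.create(new double[] {3.0, 3.5, 4.0}, new int[] {numSamples, 1});
        /* dup = true: exp() returns a new array and leaves the log-space input intact */
        final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);
        /* per-sample scalars are read with getDouble(sampleIndex), as in the metadata builder above */
        System.out.println("read depth of sample 0: " + sampleReadDepths.getDouble(0));
        /* a collected double[] is pushed into a preallocated column vector, mirroring
         * sampleLogChainPosteriors.assign(Nd4j.create(..., new int[] {numSamples, 1})) */
        final INDArray sampleLogChainPosteriors = Nd4j.zeros(numSamples, 1);
        final double[] collected = {-10.2, -9.8, -11.5};
        sampleLogChainPosteriors.assign(Nd4j.create(collected, new int[] {numSamples, 1}));
        System.out.println(sampleLogChainPosteriors);
    }
}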
Use of org.broadinstitute.hellbender.tools.coveragemodel.interfaces.CopyRatioExpectationsCalculator in project gatk by broadinstitute.
The class CoverageModelEMWorkspace, method getCopyRatioSegmentsSpark.
/**
 * Fetches copy ratio segments from the compute blocks (Spark implementation)
 *
 * @return a list, one entry per sample, of lists of {@link HiddenStateSegmentRecord}
 */
private List<List<HiddenStateSegmentRecord<STATE, Target>>> getCopyRatioSegmentsSpark() {
    /* local final copies of member variables for lambda capture */
    final List<Target> processedTargetList = new ArrayList<>(this.processedTargetList);
    final List<SexGenotypeData> processedSampleSexGenotypeData = new ArrayList<>(this.processedSampleSexGenotypeData);
    final List<String> processedSampleNameList = new ArrayList<>(this.processedSampleNameList);
    final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);
    final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> copyRatioExpectationsCalculator =
            this.copyRatioExpectationsCalculator;
    final BiFunction<SexGenotypeData, Target, STATE> referenceStateFactory = this.referenceStateFactory;

    return fetchCopyRatioEmissionDataSpark()
            /* run the copy ratio HMM on the emission data of each sample */
            .mapPartitionsToPair(it -> {
                final List<Tuple2<Integer, CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE>>> newPartitionData =
                        new ArrayList<>();
                while (it.hasNext()) {
                    final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> prevDatum = it.next();
                    final int sampleIndex = prevDatum._1;
                    final CopyRatioCallingMetadata copyRatioCallingMetadata = CopyRatioCallingMetadata.builder()
                            .sampleName(processedSampleNameList.get(sampleIndex))
                            .sampleSexGenotypeData(processedSampleSexGenotypeData.get(sampleIndex))
                            .sampleCoverageDepth(sampleReadDepths.getDouble(sampleIndex))
                            .emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN)
                            .build();
                    newPartitionData.add(new Tuple2<>(sampleIndex,
                            copyRatioExpectationsCalculator.getCopyRatioHMMResults(copyRatioCallingMetadata,
                                    processedTargetList, prevDatum._2)));
                }
                return newPartitionData.iterator();
            }, true)
            /* turn each sample's forward-backward and Viterbi results into segments */
            .mapPartitionsToPair(it -> {
                final List<Tuple2<Integer, List<HiddenStateSegmentRecord<STATE, Target>>>> newPartitionData =
                        new ArrayList<>();
                while (it.hasNext()) {
                    final Tuple2<Integer, CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE>> prevDatum = it.next();
                    final int sampleIndex = prevDatum._1;
                    final CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE> result = prevDatum._2;
                    final HMMSegmentProcessor<CoverageModelCopyRatioEmissionData, STATE, Target> processor =
                            new HMMSegmentProcessor<>(
                                    Collections.singletonList(result.getMetaData().getSampleName()),
                                    Collections.singletonList(result.getMetaData().getSampleSexGenotypeData()),
                                    referenceStateFactory,
                                    Collections.singletonList(new HashedListTargetCollection<>(processedTargetList)),
                                    Collections.singletonList(result.getForwardBackwardResult()),
                                    Collections.singletonList(result.getViterbiResult()));
                    newPartitionData.add(new Tuple2<>(sampleIndex, processor.getSegmentsAsList()));
                }
                return newPartitionData.iterator();
            })
            /* collect on the driver and restore the original sample order */
            .collect().stream()
            .sorted(Comparator.comparingInt(t -> t._1))
            .map(t -> t._2)
            .collect(Collectors.toList());
}
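The final collect().stream().sorted(...) chain deserves a remark: Spark returns collected pairs in partition order, which need not match the original sample order, so the driver sorts by the integer sample index before stripping the keys. Below is a minimal sketch of that step on plain Java collections; the placeholder strings stand in for the per-sample segment lists.

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import scala.Tuple2;

public final class RestoreSampleOrderDemo {
    public static void main(final String[] args) {
        /* pairs as they might arrive from collect(): keyed by sample index, arbitrarily ordered */
        final List<Tuple2<Integer, String>> collected = Arrays.asList(
                new Tuple2<>(2, "segments of sample 2"),
                new Tuple2<>(0, "segments of sample 0"),
                new Tuple2<>(1, "segments of sample 1"));
        /* sort by sample index, then drop the keys, exactly as in the method above */
        final List<String> inSampleOrder = collected.stream()
                .sorted(Comparator.comparingInt(t -> t._1))
                .map(t -> t._2)
                .collect(Collectors.toList());
        inSampleOrder.forEach(System.out::println);
    }
}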
Use of org.broadinstitute.hellbender.tools.coveragemodel.interfaces.CopyRatioExpectationsCalculator in project gatk-protected by broadinstitute.
The class CoverageModelEMWorkspace, method getCopyRatioSegmentsSpark.
/**
 * Fetches copy ratio segments from the compute blocks (Spark implementation)
 *
 * @return a list, one entry per sample, of lists of {@link HiddenStateSegmentRecord}
 */
private List<List<HiddenStateSegmentRecord<STATE, Target>>> getCopyRatioSegmentsSpark() {
    /* local final copies of member variables for lambda capture */
    final List<Target> processedTargetList = new ArrayList<>(this.processedTargetList);
    final List<SexGenotypeData> processedSampleSexGenotypeData = new ArrayList<>(this.processedSampleSexGenotypeData);
    final List<String> processedSampleNameList = new ArrayList<>(this.processedSampleNameList);
    final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);
    final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> copyRatioExpectationsCalculator =
            this.copyRatioExpectationsCalculator;
    final BiFunction<SexGenotypeData, Target, STATE> referenceStateFactory = this.referenceStateFactory;

    return fetchCopyRatioEmissionDataSpark()
            /* run the copy ratio HMM on the emission data of each sample */
            .mapPartitionsToPair(it -> {
                final List<Tuple2<Integer, CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE>>> newPartitionData =
                        new ArrayList<>();
                while (it.hasNext()) {
                    final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> prevDatum = it.next();
                    final int sampleIndex = prevDatum._1;
                    final CopyRatioCallingMetadata copyRatioCallingMetadata = CopyRatioCallingMetadata.builder()
                            .sampleName(processedSampleNameList.get(sampleIndex))
                            .sampleSexGenotypeData(processedSampleSexGenotypeData.get(sampleIndex))
                            .sampleCoverageDepth(sampleReadDepths.getDouble(sampleIndex))
                            .emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN)
                            .build();
                    newPartitionData.add(new Tuple2<>(sampleIndex,
                            copyRatioExpectationsCalculator.getCopyRatioHMMResults(copyRatioCallingMetadata,
                                    processedTargetList, prevDatum._2)));
                }
                return newPartitionData.iterator();
            }, true)
            /* turn each sample's forward-backward and Viterbi results into segments */
            .mapPartitionsToPair(it -> {
                final List<Tuple2<Integer, List<HiddenStateSegmentRecord<STATE, Target>>>> newPartitionData =
                        new ArrayList<>();
                while (it.hasNext()) {
                    final Tuple2<Integer, CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE>> prevDatum = it.next();
                    final int sampleIndex = prevDatum._1;
                    final CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE> result = prevDatum._2;
                    final HMMSegmentProcessor<CoverageModelCopyRatioEmissionData, STATE, Target> processor =
                            new HMMSegmentProcessor<>(
                                    Collections.singletonList(result.getMetaData().getSampleName()),
                                    Collections.singletonList(result.getMetaData().getSampleSexGenotypeData()),
                                    referenceStateFactory,
                                    Collections.singletonList(new HashedListTargetCollection<>(processedTargetList)),
                                    Collections.singletonList(result.getForwardBackwardResult()),
                                    Collections.singletonList(result.getViterbiResult()));
                    newPartitionData.add(new Tuple2<>(sampleIndex, processor.getSegmentsAsList()));
                }
                return newPartitionData.iterator();
            })
            /* collect on the driver and restore the original sample order */
            .collect().stream()
            .sorted(Comparator.comparingInt(t -> t._1))
            .map(t -> t._2)
            .collect(Collectors.toList());
}
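One recurring detail in both pipelines is the boolean passed to the first mapPartitionsToPair call: preservesPartitioning = true tells Spark that the sample-index keys were not changed, so the existing partitioner can be kept and downstream key-based operations can avoid a shuffle. Below is a minimal sketch of the idiom, assuming a local Spark context; the class name PreservePartitioningDemo and the payload strings are invented.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public final class PreservePartitioningDemo {
    public static void main(final String[] args) {
        try (final JavaSparkContext ctx = new JavaSparkContext("local[2]", "preserve-partitioning-demo")) {
            final JavaPairRDD<Integer, String> bySample = ctx.parallelizePairs(Arrays.asList(
                    new Tuple2<>(0, "emission data of sample 0"),
                    new Tuple2<>(1, "emission data of sample 1")));
            final JavaPairRDD<Integer, Integer> transformed = bySample.mapPartitionsToPair(it -> {
                final List<Tuple2<Integer, Integer>> out = new ArrayList<>();
                while (it.hasNext()) {
                    final Tuple2<Integer, String> datum = it.next();
                    /* the sample-index key passes through untouched */
                    out.add(new Tuple2<>(datum._1, datum._2.length()));
                }
                return out.iterator();
            }, true); /* preservesPartitioning = true: valid only because keys are unmodified */
            transformed.collect().forEach(t -> System.out.println("sample " + t._1 + " -> " + t._2));
        }
    }
}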