use of org.broadinstitute.hellbender.utils.hmm.segmentation.HiddenStateSegmentRecord in project gatk by broadinstitute.
the class CoverageModelEMWorkspace method getCopyRatioSegmentsSpark.
/**
 * Builds the copy ratio segments of every sample from the emission data held by the compute
 * blocks (Spark implementation).
 *
 * <p>The work is done in two partition-wise passes over the pair RDD returned by
 * {@code fetchCopyRatioEmissionDataSpark()}: the first pass turns each sample's emission data into
 * {@link CopyRatioHMMResults}, the second pass feeds each result to an {@link HMMSegmentProcessor}
 * to obtain that sample's segments. The per-sample results are then collected and sorted by
 * sample index.
 *
 * @return one list of {@link HiddenStateSegmentRecord} per sample, ordered by ascending sample index
 */
private List<List<HiddenStateSegmentRecord<STATE, Target>>> getCopyRatioSegmentsSpark() {
    /* copies of member state held in final locals so that the lambdas below capture locals */
    final List<Target> targets = new ArrayList<>(this.processedTargetList);
    final List<SexGenotypeData> sexGenotypes = new ArrayList<>(this.processedSampleSexGenotypeData);
    final List<String> sampleNames = new ArrayList<>(this.processedSampleNameList);
    final INDArray readDepths = Transforms.exp(sampleMeanLogReadDepths, true);
    final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> calculator =
            this.copyRatioExpectationsCalculator;
    final BiFunction<SexGenotypeData, Target, STATE> refStateFactory = this.referenceStateFactory;

    final List<Tuple2<Integer, List<HiddenStateSegmentRecord<STATE, Target>>>> segmentsBySampleIndex =
            fetchCopyRatioEmissionDataSpark()
                    /* pass 1: emission data -> HMM results (second argument is Spark's preservesPartitioning flag) */
                    .mapPartitionsToPair(emissionIterator -> {
                        final List<Tuple2<Integer, CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE>>> hmmResults =
                                new ArrayList<>();
                        while (emissionIterator.hasNext()) {
                            final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> entry = emissionIterator.next();
                            final int si = entry._1;
                            final CopyRatioCallingMetadata metadata = CopyRatioCallingMetadata.builder()
                                    .sampleName(sampleNames.get(si))
                                    .sampleSexGenotypeData(sexGenotypes.get(si))
                                    .sampleCoverageDepth(readDepths.getDouble(si))
                                    .emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN)
                                    .build();
                            hmmResults.add(new Tuple2<>(si,
                                    calculator.getCopyRatioHMMResults(metadata, targets, entry._2)));
                        }
                        return hmmResults.iterator();
                    }, true)
                    /* pass 2: HMM results -> segments, one single-sample processor per result */
                    .mapPartitionsToPair(resultIterator -> {
                        final List<Tuple2<Integer, List<HiddenStateSegmentRecord<STATE, Target>>>> segments =
                                new ArrayList<>();
                        while (resultIterator.hasNext()) {
                            final Tuple2<Integer, CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE>> entry =
                                    resultIterator.next();
                            final int si = entry._1;
                            final CopyRatioHMMResults<CoverageModelCopyRatioEmissionData, STATE> hmmResult = entry._2;
                            final HMMSegmentProcessor<CoverageModelCopyRatioEmissionData, STATE, Target> segmentProcessor =
                                    new HMMSegmentProcessor<>(
                                            Collections.singletonList(hmmResult.getMetaData().getSampleName()),
                                            Collections.singletonList(hmmResult.getMetaData().getSampleSexGenotypeData()),
                                            refStateFactory,
                                            Collections.singletonList(new HashedListTargetCollection<>(targets)),
                                            Collections.singletonList(hmmResult.getForwardBackwardResult()),
                                            Collections.singletonList(hmmResult.getViterbiResult()));
                            segments.add(new Tuple2<>(si, segmentProcessor.getSegmentsAsList()));
                        }
                        return segments.iterator();
                    })
                    .collect();

    /* collected tuples are put back into sample-index order before the index is dropped */
    return segmentsBySampleIndex.stream()
            .sorted(Comparator.comparingInt(t -> t._1))
            .map(t -> t._2)
            .collect(Collectors.toList());
}
use of org.broadinstitute.hellbender.utils.hmm.segmentation.HiddenStateSegmentRecord in project gatk by broadinstitute.
the class ConvertGSVariantsToSegments method apply.
/**
 * Writes one segment record per genotype of the given GS variant.
 *
 * <p>For every genotype (visited in sample-name order) the copy number is compared with
 * {@code neutralCopyNumber} to decide the {@link CopyNumberTriState} call, the call and non-reference
 * qualities are derived from the genotype's copy number posteriors, and the resulting
 * {@link HiddenStateSegmentRecord} is written to {@code outputWriter}.
 *
 * @param variant the variant whose genotypes are converted; it also defines the segment interval.
 * @param readsContext not used by this method.
 * @param referenceContext not used by this method.
 * @param featureContext not used by this method.
 * @throws UserException.CouldNotCreateOutputFile if writing a record fails with an {@link IOException}.
 */
@Override
public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) {
    final SimpleInterval interval = new SimpleInterval(variant);
    final int targetCount = targets.indexRange(interval).size();
    for (final Genotype genotype : variant.getGenotypes().iterateInSampleNameOrder()) {
        final String sample = genotype.getSampleName();
        final double mean = doubleFrom(genotype.getExtendedAttribute(GS_COPY_NUMBER_FRACTION));
        final int copyNumber = intFrom(genotype.getExtendedAttribute(GS_COPY_NUMBER_FORMAT));
        final CopyNumberTriState call = copyNumber == neutralCopyNumber
                ? CopyNumberTriState.NEUTRAL
                : (copyNumber < neutralCopyNumber) ? CopyNumberTriState.DELETION : CopyNumberTriState.DUPLICATION;
        final double[] probs = doubleArrayFrom(genotype.getExtendedAttribute(GS_COPY_NUMBER_POSTERIOR));
        final double log10PostQualCall = calculateLog10CallQuality(probs, call);
        final double log10PostQualNonRef = calculateLog10CallQualityNonRef(probs);
        // Convert the log10 values to the Phred scale.
        final double phredProbCall = -10.0 * log10PostQualCall;
        final double phredProbNonRef = -10.0 * log10PostQualNonRef;
        final HiddenStateSegment<CopyNumberTriState, Target> segment = new HiddenStateSegment<>(
                interval,
                targetCount,
                mean,
                0.0, // GS VCF does not contain any stddev or var estimate for coverage fraction.
                call,
                phredProbCall, // GS does not provide an EQ, we approximate it to be the 1 - sum of all call compatible CN corresponding posterior probs.
                Double.NaN, // GS does not provide a SQ, we leave it as NaN.
                Double.NaN, // GS does not provide a START Q.
                Double.NaN, // GS does not provide an END Q.
                phredProbNonRef);
        final HiddenStateSegmentRecord<CopyNumberTriState, Target> record = new HiddenStateSegmentRecord<>(sample, segment);
        try {
            outputWriter.writeRecord(record);
        } catch (final IOException ex) {
            throw new UserException.CouldNotCreateOutputFile(outputFile, ex);
        }
    }
}
use of org.broadinstitute.hellbender.utils.hmm.segmentation.HiddenStateSegmentRecord in project gatk by broadinstitute.
the class XHMMSegmentCallerIntegrationTest method testRunCommandLine.
//TODO: this test used to check concordance with XHMM. It no longer does, because our model has diverged
//TODO: from XHMM's. Eventually the right thing to do is to use the simulateChain() method to generate
//TODO: simulated data for some artificial set of CNV segments and to test concordance with those segments.
//TODO: However, we still use XHMM's emission model, which is both not generative and quite silly. Once we
//TODO: have a generative model of coverage we can modify simulateChain() accordingly and then write a
//TODO: concordance test here. Until then, we do not have an integration test but we do have our ongoing
//TODO: evaluations, which show the superiority of our modifications versus the original XHMM model.
/**
 * Runs the tool on a simulated chain and performs structural checks on the output: the output file
 * exists, records are in order and have a consistent number of targets, the sample names agree with
 * the chain, and each sample's segments cover all targets with valid coordinates.
 *
 * <p>NOTE(review): TestNG by default skips {@code @Test} methods with a non-void return type unless
 * {@code allow-return-values} is enabled; confirm this method is actually executed as a test.
 *
 * @param chain the simulated data supplied by the {@code simulatedChainData} provider.
 * @return the output file written by the command line run.
 */
@Test(dataProvider = "simulatedChainData")
public File testRunCommandLine(final XHMMData chain) {
    // Run the tool on a temporary copy of the chain.
    final File chainFile = writeChainInTempFile(chain);
    final File segmentsFile = createTempFile("output", ".tab");
    runCommandLine(chain, chainFile, segmentsFile);
    Assert.assertTrue(segmentsFile.exists());

    // Checks over the whole output.
    final TargetCollection<Target> targets = TargetArgumentCollection.readTargetCollection(REALISTIC_TARGETS_FILE);
    final List<HiddenStateSegmentRecord<CopyNumberTriState, Target>> records = readOutputRecords(segmentsFile);
    assertOutputIsInOrder(records, targets);
    assertOutputHasConsistentNumberOfTargets(records, targets);

    // Checks per sample.
    final Map<String, List<HiddenStateSegmentRecord<CopyNumberTriState, Target>>> recordsBySample =
            splitOutputRecordBySample(records);
    assertSampleNames(recordsBySample.keySet(), chain);
    recordsBySample.values().forEach(sampleRecords -> {
        assertSampleSegmentsCoverAllTargets(sampleRecords, targets);
        assertSampleSegmentsCoordinates(sampleRecords, targets);
    });
    return segmentsFile;
}
use of org.broadinstitute.hellbender.utils.hmm.segmentation.HiddenStateSegmentRecord in project gatk by broadinstitute.
the class XHMMSegmentGenotyperIntegrationTest method assertVariantsAreCoveredBySegments.
/**
 * Asserts that every variant has at least one segment with exactly the same interval, and that
 * within those matching segments a sample appears if and only if its genotype is flagged as a
 * discovery.
 *
 * @param variants the variants to check.
 * @param variantSegments the segments the variants are expected to be backed by.
 */
private void assertVariantsAreCoveredBySegments(final List<VariantContext> variants, final List<HiddenStateSegmentRecord<CopyNumberTriState, Target>> variantSegments) {
    for (final VariantContext variant : variants) {
        // Built once per variant instead of once per candidate segment inside the filter.
        final SimpleInterval variantInterval = new SimpleInterval(variant);
        final List<HiddenStateSegmentRecord<CopyNumberTriState, Target>> matches = variantSegments.stream()
                .filter(s -> variantInterval.equals(s.getSegment().getInterval()))
                .collect(Collectors.toList());
        Assert.assertFalse(matches.isEmpty(), "no segment matches the interval of variant at " + variantInterval);
        for (final Genotype genotype : variant.getGenotypes()) {
            final String sampleName = genotype.getSampleName();
            final boolean discovery = genotype.getExtendedAttribute(XHMMSegmentGenotyper.DISCOVERY_KEY).toString().equals(XHMMSegmentGenotyper.DISCOVERY_TRUE);
            final boolean sampleHasSegment = matches.stream().anyMatch(s -> s.getSampleName().equals(sampleName));
            if (discovery) {
                Assert.assertTrue(sampleHasSegment, "discovery genotype without a matching segment: " + sampleName + " at " + variantInterval);
            } else {
                Assert.assertFalse(sampleHasSegment, "non-discovery genotype with a matching segment: " + sampleName + " at " + variantInterval);
            }
        }
    }
}
use of org.broadinstitute.hellbender.utils.hmm.segmentation.HiddenStateSegmentRecord in project gatk-protected by broadinstitute.
the class XHMMSegmentGenotyperIntegrationTest method assertVariantSegmentsAreCovered.
/**
 * Asserts that every segment is backed by a variant with exactly the same interval, and that the
 * genotype of the segment's sample in that variant agrees with the segment: it is flagged as a
 * discovery, has a single allele (matching the call when the event quality exceeds 10), carries
 * the segment's some/start/end qualities, and has PLs consistent with the segment's exact and
 * event qualities.
 *
 * @param variants the variants expected to back the segments.
 * @param variantSegments the segments to check.
 */
private void assertVariantSegmentsAreCovered(final List<VariantContext> variants, final List<HiddenStateSegmentRecord<CopyNumberTriState, Target>> variantSegments) {
    for (final HiddenStateSegmentRecord<CopyNumberTriState, Target> segmentRecord : variantSegments) {
        final Optional<VariantContext> match = variants.stream()
                .filter(vc -> new SimpleInterval(vc).equals(segmentRecord.getSegment().getInterval()))
                .findFirst();
        Assert.assertTrue(match.isPresent());

        final Genotype genotype = match.get().getGenotype(segmentRecord.getSampleName());
        final String discovery = genotype.getAnyAttribute(XHMMSegmentGenotyper.DISCOVERY_KEY).toString();
        Assert.assertTrue(discovery.equals(XHMMSegmentGenotyper.DISCOVERY_TRUE));

        final CopyNumberTriState call = segmentRecord.getSegment().getCall();
        final List<Allele> alleles = genotype.getAlleles();
        Assert.assertEquals(alleles.size(), 1);
        // The call may not be the same for case where the event-quality is relatively low.
        if (segmentRecord.getSegment().getEventQuality() > 10) {
            Assert.assertEquals(CopyNumberTriStateAllele.valueOf(alleles.get(0)).state, call, genotype.toString());
        }

        // The quality attributes are arrays: element 0 is read for deletion calls, element 1 otherwise.
        final int qualityIndex = call == CopyNumberTriState.DELETION ? 0 : 1;
        final String failureContext = segmentRecord.getSampleName() + " => " + genotype.toString();

        final String[] someQualityValues = genotype.getAnyAttribute(XHMMSegmentGenotyper.SOME_QUALITY_KEY).toString()
                .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);
        final double expectedSomeQuality = segmentRecord.getSegment().getSomeQuality();
        Assert.assertEquals(Double.parseDouble(someQualityValues[qualityIndex]), expectedSomeQuality,
                XHMMSegmentGenotyper.PHRED_SCORE_PRECISION, failureContext);

        final String[] startQualityValues = genotype.getAnyAttribute(XHMMSegmentGenotyper.START_QUALITY_KEY).toString()
                .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);
        final double expectedStartQuality = segmentRecord.getSegment().getStartQuality();
        Assert.assertEquals(Double.parseDouble(startQualityValues[qualityIndex]), expectedStartQuality,
                XHMMSegmentGenotyper.PHRED_SCORE_PRECISION, failureContext);

        final String[] endQualityValues = genotype.getAnyAttribute(XHMMSegmentGenotyper.END_QUALITY_KEY).toString()
                .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);
        final double expectedEndQuality = segmentRecord.getSegment().getEndQuality();
        Assert.assertEquals(Double.parseDouble(endQualityValues[qualityIndex]), expectedEndQuality,
                XHMMSegmentGenotyper.PHRED_SCORE_PRECISION, failureContext);

        // Check the PL: the GQ implied by the observed PLs must be within 1 of the GQ expected from the segment.
        final int[] pls = genotype.getPL();
        final int refPL = pls[CopyNumberTriStateAllele.REF.index()];
        final int callPL = pls[CopyNumberTriStateAllele.valueOf(call).index()];
        final int observedGQ = Math.min(XHMMSegmentGenotyper.MAX_GQ, refPL - callPL);

        final double expectedCallPL = GATKProtectedMathUtils.roundPhred(
                QualityUtils.phredScaleErrorRate(QualityUtils.qualToProb(segmentRecord.getSegment().getExactQuality())),
                HMMPostProcessor.PHRED_SCORE_PRECISION);
        final double expectedRefPL = GATKProtectedMathUtils.roundPhred(
                QualityUtils.phredScaleCorrectRate(QualityUtils.qualToProb(segmentRecord.getSegment().getEventQuality())),
                HMMPostProcessor.PHRED_SCORE_PRECISION);
        final int expectedGQ = Math.min(XHMMSegmentGenotyper.MAX_GQ, (int) Math.round(expectedRefPL - expectedCallPL));

        Assert.assertTrue(Math.abs(observedGQ - expectedGQ) <= 1,
                genotype.toString() + " " + segmentRecord.getSegment().getEventQuality());
    }
}
Aggregations