Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk-protected by broadinstitute: class AlleleFractionInitializer, method initialMinorFractions.
/**
 * Initialize minor fractions assuming no allelic bias. <p></p>
 *
 * We integrate over the minor fraction f to get posterior probabilities (responsibilities)
 * of the alt or ref allele being the minor allele:
 * the responsibility of alt minor is proportional to int_{0 to 1/2} f^a (1-f)^r df, and
 * the responsibility of ref minor is proportional to int_{0 to 1/2} f^r (1-f)^a df.
 * These integrals are proportional to I(1/2, a + 1, r + 1) and I(1/2, r + 1, a + 1),
 * respectively, where I is the regularized incomplete Beta function. By the symmetry
 * I(1/2, a, b) = 1 - I(1/2, b, a), these two responsibilities sum to 1, i.e. they are
 * already normalized. <p></p>
 *
 * Finally, we set each minor fraction to the responsibility-weighted total count of
 * reads in the minor allele divided by the total read count, ignoring outliers.
 */
private AlleleFractionState.MinorFractions initialMinorFractions(final AlleleFractionData data) {
    final int numSegments = data.getNumSegments();
    final AlleleFractionState.MinorFractions result = new AlleleFractionState.MinorFractions(numSegments);
    for (int segment = 0; segment < numSegments; segment++) {
        double responsibilityWeightedMinorAlleleReadCount = 0.0;
        double responsibilityWeightedTotalReadCount = 0.0;
        for (final AllelicCount count : data.getCountsInSegment(segment)) {
            final int a = count.getAltReadCount();
            final int r = count.getRefReadCount();
            double altMinorResponsibility;
            try {
                altMinorResponsibility = Beta.regularizedBeta(0.5, a + 1, r + 1);
            } catch (final MaxCountExceededException e) {
                // if the special function can't be computed, give an all-or-nothing responsibility
                altMinorResponsibility = a < r ? 1.0 : 0.0;
            }
            responsibilityWeightedMinorAlleleReadCount += altMinorResponsibility * a + (1 - altMinorResponsibility) * r;
            responsibilityWeightedTotalReadCount += a + r;
        }
        // we achieve a flat prior via a single pseudocount for minor and non-minor reads, hence the +1 and +2
        result.add((responsibilityWeightedMinorAlleleReadCount + 1) / (responsibilityWeightedTotalReadCount + 2));
    }
    return result;
}
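As a quick sanity check on the normalization claim in the Javadoc, here is a minimal standalone sketch (illustrative, not part of gatk-protected; the class name and counts are made up) showing that the two responsibilities computed with Beta.regularizedBeta sum to 1, and how the responsibility-weighted minor count is formed:

import org.apache.commons.math3.special.Beta;

public class ResponsibilityDemo {
    public static void main(String[] args) {
        final int a = 7;   // hypothetical alt read count
        final int r = 13;  // hypothetical ref read count
        final double altMinorResponsibility = Beta.regularizedBeta(0.5, a + 1, r + 1);
        final double refMinorResponsibility = Beta.regularizedBeta(0.5, r + 1, a + 1);
        // symmetry of the regularized incomplete Beta function: these sum to 1
        System.out.println(altMinorResponsibility + refMinorResponsibility);  // ~1.0
        // responsibility-weighted minor-allele read count, as in the loop above
        System.out.println(altMinorResponsibility * a + (1 - altMinorResponsibility) * r);
    }
}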
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk-protected by broadinstitute: class IntegerCopyNumberTransitionProbabilityCacheUnitTest, method testBasicSoundness.
@Test
public void testBasicSoundness() {
    for (final RealMatrix transitionMatrix : TRANSITION_MATRICES) {
        final IntegerCopyNumberTransitionProbabilityCache cache = new IntegerCopyNumberTransitionProbabilityCache(
                new IntegerCopyNumberTransitionMatrix(transitionMatrix, 0));
        for (final int dist : DISTANCES) {
            final RealMatrix transitionMatrixExponentiated = cache.getTransitionProbabilityMatrix(dist);
            /* assert positivity */
            Assert.assertTrue(Arrays.stream(transitionMatrixExponentiated.getData())
                    .flatMapToDouble(Arrays::stream).allMatch(d -> d >= 0));
            /* assert conservation of probability */
            for (int c = 0; c < transitionMatrix.getColumnDimension(); c++) {
                Assert.assertEquals(Arrays.stream(transitionMatrixExponentiated.getColumn(c)).sum(), 1.0, EPSILON);
            }
            /* assert correctness, T(2*d) = T(d)*T(d) */
            assertEqualMatrices(cache.getTransitionProbabilityMatrix(2 * dist),
                    transitionMatrixExponentiated.multiply(transitionMatrixExponentiated));
        }
        /* assert loss of initial state over long distances, i.e. all columns must be equal */
        final RealMatrix longRangeTransitionMatrix = cache.getTransitionProbabilityMatrix(Integer.MAX_VALUE);
        final double[] firstColumn = longRangeTransitionMatrix.getColumn(0);
        final RealMatrix syntheticLongRangeTransitionMatrix =
                new Array2DRowRealMatrix(firstColumn.length, firstColumn.length);
        for (int i = 0; i < firstColumn.length; i++) {
            syntheticLongRangeTransitionMatrix.setColumn(i, firstColumn);
        }
        assertEqualMatrices(longRangeTransitionMatrix, syntheticLongRangeTransitionMatrix);
        final double[] stationary = cache.getStationaryProbabilityVector().toArray();
        ArrayAsserts.assertArrayEquals(stationary, firstColumn, EPSILON);
    }
}
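The test takes cache.getStationaryProbabilityVector() on faith; one way to cross-check a stationary vector independently is an eigendecomposition, since the stationary distribution of a column-stochastic matrix is the eigenvector for eigenvalue 1, rescaled to sum to 1. A hedged sketch with made-up matrix values (not from the test's TRANSITION_MATRICES):

import java.util.Arrays;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.EigenDecomposition;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.RealVector;

public class StationaryVectorDemo {
    public static void main(String[] args) {
        // a column-stochastic transition matrix (each column sums to 1)
        final RealMatrix transitionMatrix = new Array2DRowRealMatrix(new double[][] {
                { 0.9, 0.2 },
                { 0.1, 0.8 } });
        final EigenDecomposition eig = new EigenDecomposition(transitionMatrix);
        // pick the eigenvalue closest to 1
        int best = 0;
        for (int i = 1; i < transitionMatrix.getRowDimension(); i++) {
            if (Math.abs(eig.getRealEigenvalue(i) - 1.0) < Math.abs(eig.getRealEigenvalue(best) - 1.0)) {
                best = i;
            }
        }
        // rescale the eigenvector to sum to 1 (this also fixes an overall sign flip)
        RealVector stationary = eig.getEigenvector(best);
        stationary = stationary.mapDivide(Arrays.stream(stationary.toArray()).sum());
        System.out.println(Arrays.toString(stationary.toArray()));  // ~[0.6667, 0.3333]
    }
}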
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk-protected by broadinstitute: class IntegerCopyNumberTransitionMatrixUnitTest, method testUnnormalizedProbability.
@Test
public void testUnnormalizedProbability() {
    /* it should normalize unnormalized transition matrices and give a warning */
    final IntegerCopyNumberTransitionMatrix transitionMatrix = new IntegerCopyNumberTransitionMatrix(
            new Array2DRowRealMatrix(new double[][] { { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 } }), 0);
    for (int i = 0; i < 3; i++) {
        final double[] col = transitionMatrix.getTransitionMatrix().getColumn(i);
        Assert.assertEquals(Arrays.stream(col).sum(), 1.0, 1e-12);
    }
}
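For reference, the normalization the test expects amounts to dividing each column by its sum. The sketch below is an assumption about what the IntegerCopyNumberTransitionMatrix constructor does internally, not its actual code:

import java.util.Arrays;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;

public class ColumnNormalizationDemo {
    // divide each column by its sum so every column becomes a probability distribution
    static RealMatrix normalizeColumns(final RealMatrix m) {
        final RealMatrix result = new Array2DRowRealMatrix(m.getRowDimension(), m.getColumnDimension());
        for (int c = 0; c < m.getColumnDimension(); c++) {
            final double[] col = m.getColumn(c);
            final double sum = Arrays.stream(col).sum();
            for (int r = 0; r < col.length; r++) {
                col[r] /= sum;
            }
            result.setColumn(c, col);
        }
        return result;
    }

    public static void main(String[] args) {
        final RealMatrix m = new Array2DRowRealMatrix(new double[][] { { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 } });
        System.out.println(normalizeColumns(m));  // first column becomes [1/12, 4/12, 7/12]
    }
}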
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk-protected by broadinstitute: class TargetCoverageSexGenotypeCalculator, method calculateSexGenotypeData.
/**
 * Calculates the likelihoods of different sex genotypes for a given sample index.
 *
 * @param sampleIndex sample index
 * @return an instance of {@link SexGenotypeData}
 */
private SexGenotypeData calculateSexGenotypeData(final int sampleIndex) {
    final int[] allosomalReadCounts = Arrays.stream(processedReadCounts.getColumnOnSpecifiedTargets(sampleIndex,
            allosomalTargetList, true)).mapToInt(n -> (int) n).toArray();
    final double readDepth = getSampleReadDepthFromAutosomalTargets(sampleIndex);
    logger.debug("Read depth for " + processedReadCounts.columnNames().get(sampleIndex) + ": " + readDepth);
    final List<Double> logLikelihoods = new ArrayList<>();
    final List<String> sexGenotypesList = new ArrayList<>();
    final int numAllosomalTargets = allosomalTargetList.size();
    for (final String genotypeName : sexGenotypeIdentifiers) {
        /* calculate log likelihood */
        final int[] currentAllosomalTargetPloidies = allosomalTargetPloidies.get(genotypeName);
        final double[] poissonParameters = IntStream.range(0, numAllosomalTargets)
                .mapToDouble(i -> readDepth * (currentAllosomalTargetPloidies[i] > 0
                        ? currentAllosomalTargetPloidies[i] : baselineMappingErrorProbability))
                .toArray();
        final double currentLogLikelihood = IntStream.range(0, numAllosomalTargets)
                .mapToDouble(i -> {
                    final PoissonDistribution pois = new PoissonDistribution(poissonParameters[i]);
                    return pois.logProbability(allosomalReadCounts[i]);
                }).sum();
        sexGenotypesList.add(genotypeName);
        logLikelihoods.add(currentLogLikelihood);
    }
    /* infer the most likely sex genotype */
    final Integer[] indices = new Integer[sexGenotypesList.size()];
    IntStream.range(0, sexGenotypesList.size()).forEach(i -> indices[i] = i);
    Arrays.sort(indices, (li, ri) -> Double.compare(logLikelihoods.get(ri), logLikelihoods.get(li)));
    return new SexGenotypeData(processedReadCounts.columnNames().get(sampleIndex),
            sexGenotypesList.get(indices[0]), sexGenotypesList,
            logLikelihoods.stream().mapToDouble(d -> d).toArray());
}
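The per-genotype log likelihood above is just a sum of independent Poisson log probabilities with mean readDepth * ploidy, with a small error rate standing in for ploidy 0. A self-contained sketch with made-up counts and ploidies (not gatk-protected code):

import org.apache.commons.math3.distribution.PoissonDistribution;

public class SexGenotypeLikelihoodDemo {
    public static void main(String[] args) {
        final double readDepth = 30.0;           // hypothetical per-copy read depth
        final double mappingError = 1e-3;        // stands in for baselineMappingErrorProbability
        final int[] readCounts = { 28, 61, 2 };  // hypothetical counts on three allosomal targets
        final int[] ploidies = { 1, 2, 0 };      // hypothetical target ploidies for one genotype
        double logLikelihood = 0.0;
        for (int i = 0; i < readCounts.length; i++) {
            // expected coverage is readDepth * ploidy; ploidy 0 falls back to the error rate
            final double mean = readDepth * (ploidies[i] > 0 ? ploidies[i] : mappingError);
            logLikelihood += new PoissonDistribution(mean).logProbability(readCounts[i]);
        }
        System.out.println(logLikelihood);  // higher is better across candidate genotypes
    }
}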
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk-protected by broadinstitute: class HetPulldownCalculator, method getHetPulldown.
/**
 * For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
 * where intervals give positions of likely heterozygous SNP sites.
 *
 * <p>
 * For a normal sample:
 * <ul>
 *     <li>The IntervalList snpIntervals gives common SNP sites in 1-based format.</li>
 *     <li>The p-value threshold must be specified for a two-sided binomial test,
 *     which is used to determine which SNP sites from snpIntervals are
 *     compatible with a heterozygous SNP, given the sample. Only these sites are output.</li>
 * </ul>
 * </p>
 * <p>
 * For a tumor sample:
 * <ul>
 *     <li>The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
 *     This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
 *     Only these sites are output.</li>
 * </ul>
 * </p>
 * @param bamFile sorted BAM file for sample
 * @param snpIntervals IntervalList of SNP sites
 * @param sampleType flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
 *                   (determines whether to perform the binomial test)
 * @param pvalThreshold p-value threshold for the two-sided binomial test, used for normal samples
 * @param minimumRawReads minimum number of total reads that must be present at a het site
 * @return Pulldown of heterozygous SNP sites in 1-based format
 */
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType,
                                final double pvalThreshold, final int minimumRawReads) {
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency)
                 .referenceSequence(refFile).open(bamFile);
         final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }
        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
        final int totalNumberOfSNPs = snpIntervals.size();
        final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals,
                totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);
        // set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(),
                new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(false);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);
        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;
            // include N, etc. reads here
            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount < minimumRawReads) {
                continue;
            }
            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            // only include total ACGT counts in the binomial test (exclude N, etc.)
            final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();
            if (sampleType == SampleType.NORMAL && !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
                continue;
            }
            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex())
                    .getBases()[locus.getPosition() - 1]);
            final int refReadCount = (int) baseCounts.get(refBase);
            final int altReadCount = totalBaseCount - refReadCount;
            hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(),
                    locus.getPosition()), refReadCount, altReadCount));
        }
        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return hetPulldown;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}
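isPileupHetCompatible is not shown here; presumably it runs the two-sided binomial test mentioned in the Javadoc against an allele fraction of 0.5. A hedged sketch of such a check using commons-math3's BinomialTest (the counts are illustrative, and this is an assumption about the helper's logic, not its actual code):

import org.apache.commons.math3.stat.inference.AlternativeHypothesis;
import org.apache.commons.math3.stat.inference.BinomialTest;

public class HetCompatibilityDemo {
    public static void main(String[] args) {
        final int totalBaseCount = 40;  // total ACGT bases observed at the site (illustrative)
        final int majorBaseCount = 24;  // count of the most frequent base (illustrative)
        final double pvalThreshold = 0.05;
        // two-sided test of the null hypothesis that the site is het (allele fraction 0.5)
        final double pval = new BinomialTest().binomialTest(
                totalBaseCount, majorBaseCount, 0.5, AlternativeHypothesis.TWO_SIDED);
        // keep the site as a het candidate only if we cannot reject the 50/50 hypothesis
        System.out.println(pval >= pvalThreshold);  // true for 24 of 40
    }
}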