Use of org.apache.commons.math3.util.Pair in project gatk-protected by broadinstitute.
The class GCBiasSimulatedData, method simulatedData.
// visible for the integration test
public static Pair<ReadCountCollection, double[]> simulatedData(final int numTargets, final int numSamples) {
    final List<Target> phonyTargets = SimulatedTargets.phonyTargets(numTargets);
    final List<String> phonySamples = SimulatedSamples.phonySamples(numSamples);
    final Random random = new Random(13);
    final double[] gcContentByTarget = IntStream.range(0, numTargets)
            .mapToDouble(n -> 0.5 + 0.2 * random.nextGaussian())
            .map(x -> Math.min(x, 0.95))
            .map(x -> Math.max(x, 0.05))
            .toArray();
    final double[] gcBiasByTarget = Arrays.stream(gcContentByTarget).map(QUADRATIC_GC_BIAS_CURVE::apply).toArray();
    // model mainly GC bias with a small random amount of non-GC bias
    // thus noise after GC correction should be nearly zero
    final RealMatrix counts = new Array2DRowRealMatrix(numTargets, numSamples);
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int target, final int column, final double value) {
            return gcBiasByTarget[target] * (1.0 + 0.01 * random.nextDouble());
        }
    });
    final ReadCountCollection rcc = new ReadCountCollection(phonyTargets, phonySamples, counts);
    return new ImmutablePair<>(rcc, gcContentByTarget);
}
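Note that QUADRATIC_GC_BIAS_CURVE is defined elsewhere in GCBiasSimulatedData and is not shown in this excerpt. As a rough sketch of the shape such a curve could take (a parabola peaking at 50% GC; the class name and coefficients here are invented for illustration, not taken from the GATK source):

import java.util.function.Function;

public class GcBiasCurveSketch {
    // Hypothetical stand-in for the QUADRATIC_GC_BIAS_CURVE referenced above:
    // bias is highest at 50% GC and falls off quadratically toward the extremes.
    static final Function<Double, Double> QUADRATIC_GC_BIAS_CURVE =
            gc -> Math.max(0.0, 1.0 - 4.0 * (gc - 0.5) * (gc - 0.5));

    public static void main(String[] args) {
        for (final double gc : new double[] { 0.05, 0.25, 0.50, 0.75, 0.95 }) {
            System.out.printf("GC = %.2f -> bias = %.3f%n", gc, QUADRATIC_GC_BIAS_CURVE.apply(gc));
        }
    }
}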
Use of org.apache.commons.math3.util.Pair in project gatk by broadinstitute.
The class HetPulldownCalculator, method isPileupHetCompatible.
/**
 * Returns true if the distribution of major and other base-pair counts from a pileup at a locus is compatible with
 * an allele fraction of 0.5.
 *
 * <p>
 * Compatibility is defined by a p-value threshold. That is, compute the two-sided p-value of observing
 * a number of major read counts out of a total number of reads, assuming the given heterozygous
 * allele fraction. If the p-value is less than the given threshold, then reject the null hypothesis
 * that the heterozygous allele fraction is 0.5 (i.e., the SNP is likely homozygous) and return false;
 * otherwise, return true.
 * </p>
 *
 * @param baseCounts     base-pair counts
 * @param totalBaseCount total base-pair count (excluding N, etc.)
 * @param pvalThreshold  p-value threshold for the two-sided binomial test (should be in [0, 1], but no check is performed)
 * @return boolean compatibility with a heterozygous allele fraction of 0.5
 */
@VisibleForTesting
protected static boolean isPileupHetCompatible(final Nucleotide.Counter baseCounts, final int totalBaseCount, final double pvalThreshold) {
    final int majorReadCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).max().getAsInt();
    if (majorReadCount == 0 || totalBaseCount - majorReadCount == 0) {
        return false;
    }
    final double pval = new BinomialTest().binomialTest(totalBaseCount, majorReadCount, HET_ALLELE_FRACTION, AlternativeHypothesis.TWO_SIDED);
    return pval >= pvalThreshold;
}
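The heart of the method is commons-math3's exact two-sided binomial test. Below is a minimal, self-contained sketch of the same decision rule; the read counts and the 0.05 threshold are invented for illustration, and HET_ALLELE_FRACTION is presumably the 0.5 named in the javadoc:

import org.apache.commons.math3.stat.inference.AlternativeHypothesis;
import org.apache.commons.math3.stat.inference.BinomialTest;

public class HetCompatibilitySketch {
    public static void main(String[] args) {
        final BinomialTest test = new BinomialTest();
        // 30 major-allele reads out of 50: comfortably compatible with an allele fraction of 0.5
        final double pBalanced = test.binomialTest(50, 30, 0.5, AlternativeHypothesis.TWO_SIDED);
        // 48 major-allele reads out of 50: strong evidence against 0.5, i.e., likely homozygous
        final double pSkewed = test.binomialTest(50, 48, 0.5, AlternativeHypothesis.TWO_SIDED);
        System.out.printf("balanced: p = %.4f (kept as het at threshold 0.05: %b)%n", pBalanced, pBalanced >= 0.05);
        System.out.printf("skewed:   p = %.2e (kept as het at threshold 0.05: %b)%n", pSkewed, pSkewed >= 0.05);
    }
}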
Use of org.apache.commons.math3.util.Pair in project gatk by broadinstitute.
The class TargetCoverageSexGenotypeCalculator, method processReadCountsAndTargets.
/**
 * Processes raw read counts and targets:
 * <ul>
 *     <li> If more than one sample is present in the collection, filters out fully uncovered targets
 *          from the read counts and removes them from the target list.</li>
 *     <li> Otherwise, does nothing and warns the user.</li>
 * </ul>
 *
 * @param rawReadCounts raw read count collection
 * @param targetList user-provided target list
 * @return pair of processed read counts and targets
 */
private ImmutablePair<ReadCountCollection, List<Target>> processReadCountsAndTargets(@Nonnull final ReadCountCollection rawReadCounts, @Nonnull final List<Target> targetList) {
    final ReadCountCollection finalReadCounts;
    final List<Target> finalTargetList;
    /* remove totally uncovered targets */
    if (rawReadCounts.columnNames().size() > 1) {
        finalReadCounts = ReadCountCollectionUtils.removeTotallyUncoveredTargets(rawReadCounts, logger);
        final Set<Target> targetSetFromProcessedReadCounts = new HashSet<>(finalReadCounts.targets());
        finalTargetList = targetList.stream()
                .filter(targetSetFromProcessedReadCounts::contains)
                .collect(Collectors.toList());
    } else {
        final long numUncoveredTargets = rawReadCounts.records().stream().filter(rec -> (int) rec.getDouble(0) == 0).count();
        final long numAllTargets = rawReadCounts.targets().size();
        logger.info("Since only one sample is given for genotyping, the user is responsible for asserting" +
                " the aptitude of targets. Fully uncovered (irrelevant) targets cannot be automatically" +
                " identified (total targets: " + numAllTargets + ", uncovered targets: " + numUncoveredTargets + ")");
        finalReadCounts = rawReadCounts;
        finalTargetList = targetList;
    }
    return ImmutablePair.of(finalReadCounts, finalTargetList);
}
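The multi-sample branch relies on a common pattern: build a HashSet from the processed collection for O(1) membership tests, then filter the user-provided list against it so the user's original ordering is preserved. A minimal sketch with plain strings standing in for Target objects:

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class OrderPreservingFilterSketch {
    public static void main(String[] args) {
        final List<String> userTargets = Arrays.asList("t1", "t2", "t3", "t4");
        final Set<String> coveredTargets = new HashSet<>(Arrays.asList("t4", "t2"));
        // Filtering the ordered list against the set keeps the user's ordering,
        // regardless of the set's own iteration order.
        final List<String> finalTargets = userTargets.stream()
                .filter(coveredTargets::contains)
                .collect(Collectors.toList());
        System.out.println(finalTargets); // prints [t2, t4]
    }
}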
Use of org.apache.commons.math3.util.Pair in project gatk-protected by broadinstitute.
The class JointAFCRSegmenterUnitTest, method testSegmentation.
@Test
public void testSegmentation() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(563));
    // probability that a datum is a het, i.e. #hets / (#hets + #targets)
    final double hetProportion = 0.25;
    final List<Double> trueWeights = Arrays.asList(0.2, 0.5, 0.3);
    final double[] trueMinorAlleleFractions = new double[] { 0.12, 0.32, 0.5 };
    final double[] trueLog2CopyRatios = new double[] { -2.0, 0.0, 1.7 };
    final List<AFCRHiddenState> trueJointStates = IntStream.range(0, trueLog2CopyRatios.length)
            .mapToObj(n -> new AFCRHiddenState(trueMinorAlleleFractions[n], trueLog2CopyRatios[n]))
            .collect(Collectors.toList());
    final double trueMemoryLength = 1e5;
    final double trueCauchyWidth = 0.2;
    final int initialNumCRStates = 20;
    final int initialNumAFStates = 20;
    final AlleleFractionGlobalParameters trueAFParams = new AlleleFractionGlobalParameters(1.0, 0.01, 0.01);
    final JointAFCRHMM trueJointModel = new JointAFCRHMM(trueJointStates, trueWeights, trueMemoryLength, trueAFParams, AllelicPanelOfNormals.EMPTY_PON, trueCauchyWidth);
    // generate joint truth
    final int chainLength = 10000;
    final List<SimpleInterval> positions = CopyRatioSegmenterUnitTest.randomPositions("chr1", chainLength, rng, trueMemoryLength / 4);
    final List<Integer> trueHiddenStates = trueJointModel.generateHiddenStateChain(positions);
    final List<AFCRHiddenState> trueAFCRSequence = trueHiddenStates.stream().map(trueJointModel::getHiddenStateValue).collect(Collectors.toList());
    final double[] trueCopyRatioSequence = trueAFCRSequence.stream().mapToDouble(AFCRHiddenState::getLog2CopyRatio).toArray();
    final double[] trueAlleleFractionSequence = trueAFCRSequence.stream().mapToDouble(AFCRHiddenState::getMinorAlleleFraction).toArray();
    // generate separate AF and CR data
    final GammaDistribution biasGenerator = AlleleFractionSegmenterUnitTest.getGammaDistribution(trueAFParams, rng);
    final double outlierProbability = trueAFParams.getOutlierProbability();
    final AllelicCountCollection afData = new AllelicCountCollection();
    final List<Double> crData = new ArrayList<>();
    final List<Target> crTargets = new ArrayList<>();
    for (int n = 0; n < positions.size(); n++) {
        final SimpleInterval position = positions.get(n);
        final AFCRHiddenState jointState = trueAFCRSequence.get(n);
        final double minorFraction = jointState.getMinorAlleleFraction();
        final double log2CopyRatio = jointState.getLog2CopyRatio();
        if (rng.nextDouble() < hetProportion) {
            // het datum
            afData.add(AlleleFractionSegmenterUnitTest.generateAllelicCount(minorFraction, position, rng, biasGenerator, outlierProbability));
        } else {
            // target datum
            crTargets.add(new Target(position));
            crData.add(CopyRatioSegmenterUnitTest.generateData(trueCauchyWidth, log2CopyRatio, rng));
        }
    }
    final ReadCountCollection rcc = new ReadCountCollection(crTargets, Arrays.asList("SAMPLE"), new Array2DRowRealMatrix(crData.stream().mapToDouble(x -> x).toArray()));
    final JointAFCRSegmenter segmenter = JointAFCRSegmenter.createJointSegmenter(initialNumCRStates, rcc, initialNumAFStates, afData, AllelicPanelOfNormals.EMPTY_PON);
    final TargetCollection<SimpleInterval> tc = new HashedListTargetCollection<>(positions);
    final List<Pair<SimpleInterval, AFCRHiddenState>> segmentation = segmenter.findSegments();
    final List<ACNVModeledSegment> jointSegments = segmentation.stream().map(pair -> {
        final SimpleInterval position = pair.getLeft();
        final AFCRHiddenState jointState = pair.getRight();
        final PosteriorSummary crSummary = PerformJointSegmentation.errorlessPosterior(jointState.getLog2CopyRatio());
        final PosteriorSummary afSummary = PerformJointSegmentation.errorlessPosterior(jointState.getMinorAlleleFraction());
        return new ACNVModeledSegment(position, crSummary, afSummary);
    }).collect(Collectors.toList());
    final double[] segmentCopyRatios = jointSegments.stream()
            .flatMap(s -> Collections.nCopies(tc.targetCount(s.getInterval()), s.getSegmentMeanPosteriorSummary().getCenter()).stream())
            .mapToDouble(x -> x)
            .toArray();
    final double[] segmentMinorFractions = jointSegments.stream()
            .flatMap(s -> Collections.nCopies(tc.targetCount(s.getInterval()), s.getMinorAlleleFractionPosteriorSummary().getCenter()).stream())
            .mapToDouble(x -> x)
            .toArray();
    final double averageMinorFractionError = Arrays.stream(MathArrays.ebeSubtract(trueAlleleFractionSequence, segmentMinorFractions)).map(Math::abs).average().getAsDouble();
    final double averageCopyRatioError = Arrays.stream(MathArrays.ebeSubtract(trueCopyRatioSequence, segmentCopyRatios)).map(Math::abs).average().getAsDouble();
    Assert.assertEquals(averageMinorFractionError, 0, 0.04);
    Assert.assertEquals(averageCopyRatioError, 0, 0.04);
}
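The error metrics at the end use MathArrays.ebeSubtract (element-by-element subtraction) followed by a mean of absolute values. A standalone sketch of that metric with made-up numbers:

import java.util.Arrays;
import org.apache.commons.math3.util.MathArrays;

public class MeanAbsoluteErrorSketch {
    public static void main(String[] args) {
        final double[] truth = { 0.12, 0.32, 0.50 };
        final double[] estimate = { 0.10, 0.35, 0.48 };
        // ebeSubtract requires equal-length arrays and returns the element-wise difference.
        final double meanAbsoluteError = Arrays.stream(MathArrays.ebeSubtract(truth, estimate))
                .map(Math::abs)
                .average()
                .getAsDouble();
        System.out.printf("mean absolute error = %.4f%n", meanAbsoluteError); // prints 0.0233
    }
}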
Use of org.apache.commons.math3.util.Pair in project gatk-protected by broadinstitute.
The class HDF5PCACoveragePoNCreationUtilsUnitTest, method testSubsetTargetToUsableOnes.
@Test(dataProvider = "readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
    final Median median = new Median();
    final RealMatrix counts = readCount.counts();
    final double[] targetMedians = IntStream.range(0, counts.getRowDimension())
            .mapToDouble(i -> median.evaluate(counts.getRow(i)))
            .toArray();
    final double threshold = new Percentile(percentile).evaluate(targetMedians);
    final Boolean[] toBeKept = DoubleStream.of(targetMedians).mapToObj(d -> d >= threshold).toArray(Boolean[]::new);
    final int toBeKeptCount = (int) Stream.of(toBeKept).filter(b -> b).count();
    final Pair<ReadCountCollection, double[]> result = HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.getLeft().targets().size(), toBeKeptCount);
    Assert.assertEquals(result.getRight().length, toBeKeptCount);
    int nextIndex = 0;
    for (int i = 0; i < toBeKept.length; i++) {
        if (toBeKept[i]) {
            final int index = result.getLeft().targets().indexOf(readCount.targets().get(i));
            Assert.assertEquals(index, nextIndex++);
            Assert.assertEquals(counts.getRow(i), result.getLeft().counts().getRow(index));
            Assert.assertEquals(result.getRight()[index], targetMedians[i]);
        } else {
            Assert.assertEquals(result.getLeft().targets().indexOf(readCount.targets().get(i)), -1);
        }
    }
}
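The expected-value computation mirrors the subsetting logic itself: compute per-target medians, then take a percentile of those medians as the cutoff. A self-contained sketch of that thresholding with a toy count matrix (the class name, numbers, and the 50th-percentile cutoff are arbitrary choices for the demo):

import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class MedianPercentileThresholdSketch {
    public static void main(String[] args) {
        final double[][] counts = {
                { 0, 1, 0 },      // poorly covered target
                { 10, 12, 11 },   // well covered target
                { 5, 6, 4 }
        };
        final Median median = new Median();
        final double[] targetMedians = new double[counts.length];
        for (int i = 0; i < counts.length; i++) {
            targetMedians[i] = median.evaluate(counts[i]);
        }
        // Keep targets whose median coverage meets the percentile threshold;
        // with these numbers the first target falls below the cutoff.
        final double threshold = new Percentile(50.0).evaluate(targetMedians);
        for (int i = 0; i < counts.length; i++) {
            System.out.printf("target %d: median = %.1f, kept = %b%n",
                    i, targetMedians[i], targetMedians[i] >= threshold);
        }
    }
}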