Use of org.apache.commons.math3.fraction.Fraction in project gatk-protected by broadinstitute: the class SNPSegmenter, method writeSegmentFile.
/**
* Write segment file based on maximum-likelihood estimates of the minor allele fraction at SNP sites,
* assuming the specified allelic bias. These estimates are converted to target coverages,
* which are written to a temporary file and then passed to {@link RCBSSegmenter}.
* @param snps TargetCollection of allelic counts at SNP sites
* @param sampleName sample name
* @param outputFile segment file to write to and return
* @param allelicBias allelic bias to use in estimate of minor allele fraction
*/
public static void writeSegmentFile(final TargetCollection<AllelicCount> snps, final String sampleName,
                                    final File outputFile, final double allelicBias) {
    Utils.validateArg(snps.totalSize() > 0, "Must have a positive number of SNPs to perform SNP segmentation.");
    try {
        final File targetsFromSNPCountsFile = File.createTempFile("targets-from-snps", ".tsv");
        final List<Target> targets = snps.targets().stream()
                .map(ac -> new Target(name(ac), ac.getInterval()))
                .collect(Collectors.toList());
        final RealMatrix minorAlleleFractions = new Array2DRowRealMatrix(snps.targetCount(), 1);
        minorAlleleFractions.setColumn(0, snps.targets().stream()
                .mapToDouble(ac -> ac.estimateMinorAlleleFraction(allelicBias)).toArray());
        ReadCountCollectionUtils.write(targetsFromSNPCountsFile,
                new ReadCountCollection(targets, Collections.singletonList(sampleName), minorAlleleFractions));
        //segment SNPs based on observed log_2 minor allele fraction (log_2 is applied in CBS.R)
        RCBSSegmenter.writeSegmentFile(sampleName, targetsFromSNPCountsFile.getAbsolutePath(),
                outputFile.getAbsolutePath(), false);
    } catch (final IOException e) {
        throw new UserException.CouldNotCreateOutputFile("Could not create temporary output file during SNP segmentation.", e);
    }
}
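For context, here is a minimal standalone sketch of the single-column matrix packing used above. Only the commons-math3 calls (Array2DRowRealMatrix, setColumn) are real; the fraction values and the class name are made up for illustration.

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;

public class SingleColumnMatrixDemo {
    public static void main(String[] args) {
        // Hypothetical per-SNP minor-allele-fraction estimates; writeSegmentFile
        // packs such values into one column of a ReadCountCollection matrix.
        final double[] fractions = {0.48, 0.31, 0.29, 0.50};
        final RealMatrix m = new Array2DRowRealMatrix(fractions.length, 1);
        m.setColumn(0, fractions);
        System.out.println(m.getRowDimension() + " rows x " + m.getColumnDimension() + " column: " + m);
    }
}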
Use of org.apache.commons.math3.fraction.Fraction in project gatk-protected by broadinstitute: the class HetPulldownCalculator, method isPileupHetCompatible.
/**
* Returns true if the distribution of major and other base-pair counts from a pileup at a locus is compatible with
* an allele fraction of 0.5.
*
* <p>
* Compatibility is defined by a p-value threshold: compute the two-sided p-value of observing the major read
* count out of the total number of reads under the null hypothesis of a heterozygous allele fraction of 0.5.
* If the p-value is less than the given threshold, reject the null hypothesis (i.e., the SNP is likely
* homozygous) and return false; otherwise return true.
* </p>
* @param baseCounts base-pair counts
* @param totalBaseCount total base-pair counts (excluding N, etc.)
* @param pvalThreshold p-value threshold for two-sided binomial test (should be in [0, 1], but no check is performed)
* @return boolean compatibility with heterozygous allele fraction
*/
@VisibleForTesting
protected static boolean isPileupHetCompatible(final Nucleotide.Counter baseCounts, final int totalBaseCount,
                                               final double pvalThreshold) {
    final int majorReadCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).max().getAsInt();
    if (majorReadCount == 0 || totalBaseCount - majorReadCount == 0) {
        return false;
    }
    final double pval = new BinomialTest().binomialTest(totalBaseCount, majorReadCount,
            HET_ALLELE_FRACTION, AlternativeHypothesis.TWO_SIDED);
    return pval >= pvalThreshold;
}
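A minimal runnable sketch of the same two-sided binomial test, using only commons-math3 (the read counts are made up): with 60 major-allele reads out of 100 total and a null allele fraction of 0.5, the two-sided p-value is about 0.057, so the site would pass a 0.05 threshold and count as het-compatible.

import org.apache.commons.math3.stat.inference.AlternativeHypothesis;
import org.apache.commons.math3.stat.inference.BinomialTest;

public class HetCompatibilityDemo {
    public static void main(String[] args) {
        // 60 major-allele reads out of 100 total; null hypothesis: allele fraction = 0.5.
        final double pval = new BinomialTest().binomialTest(100, 60, 0.5, AlternativeHypothesis.TWO_SIDED);
        // ~0.057: >= 0.05, so the null is not rejected and the locus is kept as a candidate het.
        System.out.printf("two-sided p-value = %.4f%n", pval);
    }
}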
Use of org.apache.commons.math3.fraction.Fraction in project gatk by broadinstitute: the class AlleleFractionSegmenterUnitTest, method generateAllelicCount.
protected static AllelicCount generateAllelicCount(final double minorFraction, final SimpleInterval position,
                                                   final RandomGenerator rng, final GammaDistribution biasGenerator,
                                                   final double outlierProbability) {
    final int numReads = 100;
    final double bias = biasGenerator.sample();
    //flip a coin to decide alt minor (alt fraction = minor fraction) or ref minor (alt fraction = 1 - minor fraction)
    final double altFraction = rng.nextDouble() < 0.5 ? minorFraction : 1 - minorFraction;
    //the probability of an alt read is the alt fraction modified by the bias or, in the case of an outlier, random
    final double pAlt = rng.nextDouble() < outlierProbability
            ? rng.nextDouble()
            : altFraction / (altFraction + (1 - altFraction) * bias);
    final int numAltReads = new BinomialDistribution(rng, numReads, pAlt).sample();
    final int numRefReads = numReads - numAltReads;
    return new AllelicCount(position, numAltReads, numRefReads);
}
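The key line is the bias model: the probability of an alt read is pAlt = f / (f + (1 - f) * b), where f is the alt fraction and b is the reference bias; with b = 1 this reduces to f. A small sketch under assumed values (f = 0.3, b = 1.1; the numbers, seed, and class name are illustrative, but the commons-math3 calls are real):

import org.apache.commons.math3.distribution.BinomialDistribution;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;

public class BiasModelDemo {
    public static void main(String[] args) {
        final RandomGenerator rng = new Well19937c(13L);
        final double f = 0.3;  // alt fraction (assumed)
        final double b = 1.1;  // reference bias (assumed)
        // Bias model from generateAllelicCount: ref reads are favored by a factor b,
        // so the chance of drawing an alt read drops from f to f / (f + (1 - f) * b).
        final double pAlt = f / (f + (1 - f) * b);  // 0.3 / 1.07 ~= 0.280
        final int numAltReads = new BinomialDistribution(rng, 100, pAlt).sample();
        System.out.println("pAlt = " + pAlt + ", simulated alt reads out of 100 = " + numAltReads);
    }
}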
Use of org.apache.commons.math3.fraction.Fraction in project incubator-systemml by apache: the class RandSPInstruction, method generateSample.
/**
* Helper function to construct a sample.
*
* @param sec spark execution context
*/
private void generateSample(SparkExecutionContext sec) {
    long lrows = sec.getScalarInput(rows).getLongValue();
    if (maxValue < lrows && !replace)
        throw new DMLRuntimeException("Sample (size=" + rows + ") larger than population (size=" + maxValue
                + ") can only be generated with replacement.");
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + lrows
                + ", replace=" + replace + ", seed=" + seed);

    // sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time
    double fraction = SamplingUtils.computeFractionForSampleSize((int) lrows, UtilFunctions.toLong(maxValue), replace);
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(seed);

    // divide the population range across numPartitions by creating SampleTasks
    double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long outputSize = MatrixBlock.estimateSizeDenseInMemory(lrows, 1);
    int numPartitions = (int) Math.ceil((double) outputSize / hdfsBlockSize);
    long partitionSize = (long) Math.ceil(maxValue / numPartitions);
    ArrayList<SampleTask> offsets = new ArrayList<>();
    long st = 1;
    while (st <= maxValue) {
        SampleTask s = new SampleTask();
        s.range_start = st;
        s.seed = bigrand.nextLong();
        offsets.add(s);
        st = st + partitionSize;
    }
    JavaRDD<SampleTask> offsetRDD = sec.getSparkContext().parallelize(offsets, numPartitions);

    // Construct the sample in a distributed manner
    JavaRDD<Double> rdd = offsetRDD.flatMap(new GenerateSampleBlock(replace, fraction, (long) maxValue, partitionSize));

    // Randomize the sampled elements
    JavaRDD<Double> randomizedRDD = rdd.mapToPair(new AttachRandom()).sortByKey().values();

    // Trim the sampled list to required size & attach matrix indexes to randomized elements
    JavaPairRDD<MatrixIndexes, MatrixCell> miRDD = randomizedRDD.zipWithIndex()
            .filter(new TrimSample(lrows)).mapToPair(new Double2MatrixCell());
    MatrixCharacteristics mcOut = new MatrixCharacteristics(lrows, 1, rowsInBlock, colsInBlock, lrows);

    // Construct BinaryBlock representation
    JavaPairRDD<MatrixIndexes, MatrixBlock> mbRDD =
            RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), miRDD, mcOut, true);
    sec.getMatrixCharacteristics(output.getName()).setNonZeros(lrows);
    sec.setRDDHandleForVariable(output.getName(), mbRDD);
}
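The partitioning step above simply covers the population range [1, maxValue] with contiguous blocks of partitionSize elements, each carrying an independent seed drawn from the Well1024a stream. A self-contained sketch with small made-up numbers (maxValue = 10, partitionSize = 4; a long[] pair stands in for the SampleTask fields range_start and seed):

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.math3.random.Well1024a;

public class RangePartitionDemo {
    public static void main(String[] args) {
        final long maxValue = 10;      // population size (illustrative)
        final long partitionSize = 4;  // elements per task (illustrative)
        final Well1024a bigrand = new Well1024a(42L);
        final List<long[]> tasks = new ArrayList<>();  // {range_start, seed} stand-in for SampleTask
        for (long st = 1; st <= maxValue; st += partitionSize) {
            tasks.add(new long[] { st, bigrand.nextLong() });
        }
        // Covers [1,4], [5,8], [9,10]; each block is then sampled independently with its own seed.
        tasks.forEach(t -> System.out.println("range_start=" + t[0] + ", seed=" + t[1]));
    }
}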
Use of org.apache.commons.math3.fraction.Fraction in project systemml by apache: the class RandSPInstruction, method generateSample (the code is identical to the incubator-systemml version above).