use of org.apache.commons.math3.linear.RealMatrix in project gatk-protected by broadinstitute.
the class ReadCountCollectionUtils method removeColumnsWithTooManyZeros.
/**
* Remove columns that have too many counts equal to 0.
* <p>
* It will return a copy of the input read-count collection with such columns dropped.
* </p>
*
* @param readCounts the input read counts.
* @param maximumColumnZeros maximum number of zero counts tolerated per column.
* @param roundToInteger whether to round counts to the nearest integer before testing them for zero.
* @param logger logger used to report how many columns were dropped.
* @return never {@code null}. It might be a reference to the input read-counts if there
* is no column to be dropped.
*/
@VisibleForTesting
public static ReadCountCollection removeColumnsWithTooManyZeros(final ReadCountCollection readCounts,
                                                                final int maximumColumnZeros,
                                                                final boolean roundToInteger,
                                                                final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final Set<String> columnsToKeep = IntStream.range(0, counts.getColumnDimension()).boxed()
            .filter(i -> countZeroes(counts.getColumn(i), roundToInteger) <= maximumColumnZeros)
            .map(i -> readCounts.columnNames().get(i))
            .collect(Collectors.toCollection(LinkedHashSet::new));
    final int columnsToDropCount = readCounts.columnNames().size() - columnsToKeep.size();
    if (columnsToDropCount == 0) {
        logger.info(String.format("There were no columns with a large number of targets with zero counts "
                + "(<= %d of %d) to drop", maximumColumnZeros, readCounts.targets().size()));
        return readCounts;
    } else if (columnsToDropCount == readCounts.columnNames().size()) {
        throw new UserException.BadInput("The number of zeros per count column is too large resulting in all count "
                + "columns to be dropped");
    } else {
        final double droppedPercentage = ((double) columnsToDropCount / readCounts.columnNames().size()) * 100;
        logger.info(String.format("Some counts columns dropped (%d out of %d, %.2f%%) as they had too many targets "
                + "with zeros (> %d of %d)", columnsToDropCount, readCounts.columnNames().size(),
                droppedPercentage, maximumColumnZeros, readCounts.targets().size()));
        return readCounts.subsetColumns(columnsToKeep);
    }
}
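For context, the column-dropping logic above boils down to counting zero entries per column of a RealMatrix and keeping only the columns under the threshold. Below is a minimal, self-contained sketch of that idea using plain Commons Math types; countZeroes and the GATK-specific classes are replaced with local stand-ins, so the names here are illustrative rather than taken from the GATK source.
import java.util.Arrays;
import java.util.stream.IntStream;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;

public class ZeroColumnFilterSketch {

    // Local stand-in for the private countZeroes helper (without the rounding option).
    private static long countZeroes(final double[] column) {
        return Arrays.stream(column).filter(v -> v == 0.0).count();
    }

    public static void main(String[] args) {
        final RealMatrix counts = new Array2DRowRealMatrix(new double[][] {
                {0.0, 5.0},
                {0.0, 7.0},
                {3.0, 0.0}});
        final int maximumColumnZeros = 1;
        // Keep the indices of columns with at most maximumColumnZeros zero entries.
        final int[] columnsToKeep = IntStream.range(0, counts.getColumnDimension())
                .filter(i -> countZeroes(counts.getColumn(i)) <= maximumColumnZeros)
                .toArray();
        // Column 0 has two zeros (> 1) and is dropped; column 1 has one zero and is kept.
        System.out.println(Arrays.toString(columnsToKeep)); // prints [1]
    }
}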
use of org.apache.commons.math3.linear.RealMatrix in project gatk-protected by broadinstitute.
the class XHMMSegmentCallerBase method standardizeByTarget.
/**
* Standardize read counts (per-target).
* Note: modification is done in-place.
*
* @param counts original read counts
*/
private void standardizeByTarget(final RealMatrix counts) {
    final double[] rowMeans = GATKProtectedMathUtils.rowMeans(counts);
    final double[] rowStdDev = GATKProtectedMathUtils.rowStdDevs(counts);
    counts.walkInColumnOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return (value - rowMeans[row]) / rowStdDev[row];
        }
    });
}
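GATKProtectedMathUtils is internal to gatk-protected, but the same per-target (per-row) standardization can be reproduced with stock Commons Math. A minimal, self-contained sketch, with illustrative names standing in for the GATK helpers:
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.descriptive.moment.Mean;
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;

public class RowStandardizeSketch {
    public static void main(String[] args) {
        final RealMatrix counts = new Array2DRowRealMatrix(new double[][] {
                {1.0, 2.0, 3.0},
                {10.0, 20.0, 30.0}});
        // Per-row means and (sample) standard deviations, standing in for
        // GATKProtectedMathUtils.rowMeans and rowStdDevs.
        final int nRows = counts.getRowDimension();
        final double[] rowMeans = new double[nRows];
        final double[] rowStdDevs = new double[nRows];
        for (int r = 0; r < nRows; r++) {
            rowMeans[r] = new Mean().evaluate(counts.getRow(r));
            rowStdDevs[r] = new StandardDeviation().evaluate(counts.getRow(r));
        }
        // Standardize in place using the same visitor pattern as above.
        counts.walkInColumnOrder(new DefaultRealMatrixChangingVisitor() {
            @Override
            public double visit(final int row, final int column, final double value) {
                return (value - rowMeans[row]) / rowStdDevs[row];
            }
        });
        System.out.println(counts); // both rows become {-1, 0, 1}
    }
}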
use of org.apache.commons.math3.linear.RealMatrix in project gatk-protected by broadinstitute.
the class XHMMSegmentCallerBase method standardizeBySample.
/**
* Standardize read counts (per-sample).
* Note: modification is done in-place.
*
* @param counts original read counts
*/
private void standardizeBySample(final RealMatrix counts) {
    final double[] columnMeans = GATKProtectedMathUtils.columnMeans(counts);
    final double[] columnStdDev = GATKProtectedMathUtils.columnStdDevs(counts);
    counts.walkInColumnOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return (value - columnMeans[column]) / columnStdDev[column];
        }
    });
}
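One caveat: if a column is constant, its standard deviation is zero and the division above yields NaN or infinity. A guarded variant of the visitor (an assumption of mine, not present in the GATK source) could center such columns without scaling them; it reuses counts, columnMeans and columnStdDev from the enclosing method:
// Drop-in replacement for the visitor above (hypothetical). Constant columns
// are centered but not scaled, avoiding division by zero.
counts.walkInColumnOrder(new DefaultRealMatrixChangingVisitor() {
    @Override
    public double visit(final int row, final int column, final double value) {
        final double centered = value - columnMeans[column];
        return columnStdDev[column] == 0.0 ? centered : centered / columnStdDev[column];
    }
});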
use of org.apache.commons.math3.linear.RealMatrix in project gatk-protected by broadinstitute.
the class GCCorrector method correctCoverage.
/**
* Corrects coverage for GC bias, fitting a separate GC-bias curve for each sample.
*
* @param inputCounts raw coverage before GC correction
* @param gcContentByTarget array of GC contents, one per target of the input
* @return GC-corrected coverage
*/
public static ReadCountCollection correctCoverage(final ReadCountCollection inputCounts,
                                                  final double[] gcContentByTarget) {
    // each column (sample) has its own GC bias curve, hence its own GC corrector
    final List<GCCorrector> gcCorrectors = IntStream.range(0, inputCounts.columnNames().size())
            .mapToObj(n -> new GCCorrector(gcContentByTarget, inputCounts.counts().getColumnVector(n)))
            .collect(Collectors.toList());
    // GC-correct a copy of the input counts in place
    final RealMatrix correctedCounts = inputCounts.counts().copy();
    correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int target, final int column, final double coverage) {
            return gcCorrectors.get(column).correctedCoverage(coverage, gcContentByTarget[target]);
        }
    });
    // we would like the average correction factor to be 1.0, in the sense that average
    // coverage before and after correction should be equal
    final double[] columnNormalizationFactors = IntStream.range(0, inputCounts.columnNames().size())
            .mapToDouble(c -> inputCounts.counts().getColumnVector(c).getL1Norm()
                    / correctedCounts.getColumnVector(c).getL1Norm())
            .toArray();
    correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int target, final int column, final double coverage) {
            return coverage * columnNormalizationFactors[column];
        }
    });
    return new ReadCountCollection(inputCounts.targets(), inputCounts.columnNames(), correctedCounts);
}
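The renormalization step works because, for non-negative counts, a column's L1 norm is just its sum, so scaling by the ratio of norms restores the original total (and hence average) coverage of each column. A self-contained numeric sketch of that invariant, with illustrative values:
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;

public class NormalizationFactorSketch {
    public static void main(String[] args) {
        final RealMatrix before = new Array2DRowRealMatrix(new double[][] {{4.0}, {6.0}});
        final RealMatrix after = new Array2DRowRealMatrix(new double[][] {{2.0}, {3.0}});
        // For non-negative entries the L1 norm equals the column sum.
        final double factor = before.getColumnVector(0).getL1Norm()
                / after.getColumnVector(0).getL1Norm(); // 10.0 / 5.0 = 2.0
        final RealMatrix rescaled = after.scalarMultiply(factor);
        System.out.println(rescaled.getColumnVector(0).getL1Norm()); // 10.0, matches `before`
    }
}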
use of org.apache.commons.math3.linear.RealMatrix in project gatk-protected by broadinstitute.
the class SNPSegmenter method writeSegmentFile.
/**
* Write segment file based on maximum-likelihood estimates of the minor allele fraction at SNP sites,
* assuming the specified allelic bias. These estimates are converted to target coverages,
* which are written to a temporary file and then passed to {@link RCBSSegmenter}.
* @param snps TargetCollection of allelic counts at SNP sites
* @param sampleName sample name
* @param outputFile segment file to write to
* @param allelicBias allelic bias to use in estimate of minor allele fraction
*/
public static void writeSegmentFile(final TargetCollection<AllelicCount> snps, final String sampleName,
                                    final File outputFile, final double allelicBias) {
    Utils.validateArg(snps.totalSize() > 0, "Must have a positive number of SNPs to perform SNP segmentation.");
    try {
        final File targetsFromSNPCountsFile = File.createTempFile("targets-from-snps", ".tsv");
        final List<Target> targets = snps.targets().stream()
                .map(ac -> new Target(name(ac), ac.getInterval()))
                .collect(Collectors.toList());
        final RealMatrix minorAlleleFractions = new Array2DRowRealMatrix(snps.targetCount(), 1);
        minorAlleleFractions.setColumn(0, snps.targets().stream()
                .mapToDouble(ac -> ac.estimateMinorAlleleFraction(allelicBias)).toArray());
        ReadCountCollectionUtils.write(targetsFromSNPCountsFile,
                new ReadCountCollection(targets, Collections.singletonList(sampleName), minorAlleleFractions));
        // segment SNPs based on observed log_2 minor allele fraction (log_2 is applied in CBS.R)
        RCBSSegmenter.writeSegmentFile(sampleName, targetsFromSNPCountsFile.getAbsolutePath(),
                outputFile.getAbsolutePath(), false);
    } catch (final IOException e) {
        throw new UserException.CouldNotCreateOutputFile("Could not create temporary output file during "
                + "SNP segmentation.", e);
    }
}
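The only Commons Math step here is packing the per-SNP estimates into a one-column matrix so they can stand in for a single sample's coverage. A self-contained sketch of that step, with placeholder values in place of the real estimates:
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;

public class SingleColumnMatrixSketch {
    public static void main(String[] args) {
        // Placeholder minor-allele-fraction estimates, one per SNP site.
        final double[] estimates = {0.48, 0.31, 0.05};
        final RealMatrix minorAlleleFractions = new Array2DRowRealMatrix(estimates.length, 1);
        minorAlleleFractions.setColumn(0, estimates);
        // The one-column matrix can then be wrapped as a single-sample read-count collection.
        System.out.println(minorAlleleFractions);
    }
}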