use of org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor in project gatk-protected by broadinstitute.
the class PCATangentNormalizationUtils method composeTangentNormalizationInputMatrix.
/**
* Prepares the data to perform tangent normalization.
* <p>
* This is done by count group or column:
* <ol>
* </li>we divide counts by the column mean,</li>
* </li>then we transform value to their log_2,</li>
* </li>and finally we center them around the median.</li>
* </ol>
* </p>
*
* @param matrix input matrix.
* @return never {@code null}.
*/
private static RealMatrix composeTangentNormalizationInputMatrix(final RealMatrix matrix) {
final RealMatrix result = matrix.copy();
// step 1: divide by column means and log_2 transform
final double[] columnMeans = GATKProtectedMathUtils.columnMeans(matrix);
result.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return truncatedLog2(value / columnMeans[column]);
}
});
// step 2: subtract column medians
final double[] columnMedians = IntStream.range(0, matrix.getColumnDimension()).mapToDouble(c -> new Median().evaluate(result.getColumn(c))).toArray();
result.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return value - columnMedians[column];
}
});
return result;
}
use of org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor in project gatk by broadinstitute.
the class GCCorrector method correctCoverage.
/**
*
* @param inputCounts raw coverage before GC correction
* @param gcContentByTarget array of gc contents, one per target of the input
* @return GC-corrected coverage
*/
public static ReadCountCollection correctCoverage(final ReadCountCollection inputCounts, final double[] gcContentByTarget) {
// each column (sample) has its own GC bias curve, hence its own GC corrector
final List<GCCorrector> gcCorrectors = IntStream.range(0, inputCounts.columnNames().size()).mapToObj(n -> new GCCorrector(gcContentByTarget, inputCounts.counts().getColumnVector(n))).collect(Collectors.toList());
// gc correct a copy of the input counts in-place
final RealMatrix correctedCounts = inputCounts.counts().copy();
correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(int target, int column, double coverage) {
return gcCorrectors.get(column).correctedCoverage(coverage, gcContentByTarget[target]);
}
});
// we would like the average correction factor to be 1.0 in the sense that average coverage before and after
// correction should be equal
final double[] columnNormalizationFactors = IntStream.range(0, inputCounts.columnNames().size()).mapToDouble(c -> inputCounts.counts().getColumnVector(c).getL1Norm() / correctedCounts.getColumnVector(c).getL1Norm()).toArray();
correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(int target, int column, double coverage) {
return coverage * columnNormalizationFactors[column];
}
});
return new ReadCountCollection(inputCounts.targets(), inputCounts.columnNames(), correctedCounts);
}
use of org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor in project gatk-protected by broadinstitute.
the class NormalizeSomaticReadCountsIntegrationTest method assertBetaHatsRobustToOutliers.
/**
* Asserts that the calculation of beta hats is not significantly affected by zero-coverage outlier counts
* We perform this check by randomly setting some coverages to zero in copy ratio space (-infinity in log space).
* betaHats imputes 0 in log space (1 in copy ratio space) whenever coverage is below a certain low threshold
* and should thus be robust to this type of noise.
*/
private void assertBetaHatsRobustToOutliers(final ReadCountCollection preTangentNormalized, final File ponFile) {
try (final HDF5File ponReader = new HDF5File(ponFile)) {
final PCACoveragePoN pon = new HDF5PCACoveragePoN(ponReader);
final List<String> ponTargets = pon.getPanelTargetNames();
final RealMatrix input = reorderTargetsToPoNOrder(preTangentNormalized, ponTargets);
// randomly set some entries to zero in copy-ratio space (-infinity in log space)
final Random random = new Random(13);
final double noiseProportion = 0.01;
final RealMatrix noisyInput = input.copy();
noisyInput.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return random.nextDouble() < noiseProportion ? Double.NEGATIVE_INFINITY : value;
}
});
final RealMatrix betaHats = PCATangentNormalizationUtils.calculateBetaHats(pon.getReducedPanelPInverseCounts(), input, PCATangentNormalizationUtils.EPSILON);
final RealMatrix noisyBetaHats = PCATangentNormalizationUtils.calculateBetaHats(pon.getReducedPanelPInverseCounts(), noisyInput, PCATangentNormalizationUtils.EPSILON);
final RealMatrix difference = betaHats.subtract(noisyBetaHats);
difference.walkInOptimizedOrder(new DefaultRealMatrixPreservingVisitor() {
@Override
public void visit(final int row, int column, double value) {
Assert.assertEquals(value, 0, 0.01);
}
});
}
}
use of org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor in project gatk-protected by broadinstitute.
the class HDF5LibraryUnitTest method createMatrixOfGaussianValues.
private RealMatrix createMatrixOfGaussianValues(int numRows, int numCols, final double mean, final double sigma) {
final RealMatrix bigCounts = new Array2DRowRealMatrix(numRows, numCols);
final RandomDataGenerator randomDataGenerator = new RandomDataGenerator();
randomDataGenerator.reSeed(337337337);
bigCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(int row, int column, double value) {
return randomDataGenerator.nextGaussian(mean, sigma);
}
});
return bigCounts;
}
use of org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor in project gatk by broadinstitute.
the class ReadCountCollectionUtils method imputeZeroCountsAsTargetMedians.
/**
* Impute zero counts to the median of non-zero values in the enclosing target row.
*
* <p>The imputation is done in-place, thus the input matrix is well be modified as a result of this call.</p>
*
* @param readCounts the input and output read-count matrix.
*/
public static void imputeZeroCountsAsTargetMedians(final ReadCountCollection readCounts, final Logger logger) {
final RealMatrix counts = readCounts.counts();
final int targetCount = counts.getRowDimension();
final Median medianCalculator = new Median();
int totalCounts = counts.getColumnDimension() * counts.getRowDimension();
// Get the number of zeroes contained in the counts.
final long totalZeroCounts = IntStream.range(0, targetCount).mapToLong(t -> DoubleStream.of(counts.getRow(t)).filter(c -> c == 0.0).count()).sum();
// Get the median of each row, not including any entries that are zero.
final double[] medians = IntStream.range(0, targetCount).mapToDouble(t -> medianCalculator.evaluate(DoubleStream.of(counts.getRow(t)).filter(c -> c != 0.0).toArray())).toArray();
// Change any zeros in the counts to the median for the row.
counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return value != 0 ? value : medians[row];
}
});
if (totalZeroCounts > 0) {
final double totalZeroCountsPercentage = 100.0 * (totalZeroCounts / totalCounts);
logger.info(String.format("Some 0.0 counts (%d out of %d, %.2f%%) were imputed to their enclosing target " + "non-zero median", totalZeroCounts, totalZeroCounts, totalZeroCountsPercentage));
} else {
logger.info("No count is 0.0 thus no count needed to be imputed.");
}
}
Aggregations