Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk-protected by broadinstitute.
The class CalculateContamination, method findConfidentHomAltSites.
private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
if (sites.isEmpty()) {
return new ArrayList<>();
}
final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(sites);
final double averageCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();
final List<Double> smoothedCopyRatios = new ArrayList<>();
final List<Double> hetRatios = new ArrayList<>();
for (final PileupSummary site : sites) {
final SimpleInterval nearbySpan = new SimpleInterval(site.getContig(), Math.max(1, site.getStart() - CNV_SCALE), site.getEnd() + CNV_SCALE);
final List<PileupSummary> nearbySites = tc.targets(nearbySpan);
final double averageNearbyCopyRatio = nearbySites.stream().mapToDouble(s -> s.getTotalCount() / averageCoverage).average().orElseGet(() -> 0);
smoothedCopyRatios.add(averageNearbyCopyRatio);
final double expectedNumberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAlleleFrequency).map(x -> 2 * x * (1 - x)).sum();
final long numberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAltFraction).filter(x -> 0.4 < x && x < 0.6).count();
final double hetRatio = numberOfNearbyHets / expectedNumberOfNearbyHets;
hetRatios.add(hetRatio);
}
final double medianSmoothedCopyRatio = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
final List<Integer> indicesWithAnomalousCopyRatio = IntStream.range(0, sites.size()).filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianSmoothedCopyRatio || smoothedCopyRatios.get(n) > 2 * medianSmoothedCopyRatio).boxed().collect(Collectors.toList());
final double meanHetRatio = hetRatios.stream().mapToDouble(x -> x).average().getAsDouble();
final List<Integer> indicesWithLossOfHeterozygosity = IntStream.range(0, sites.size()).filter(n -> hetRatios.get(n) < meanHetRatio * 0.5).boxed().collect(Collectors.toList());
// TODO: as an extra safeguard, filter out sites that are near too many hom alts
logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity", indicesWithAnomalousCopyRatio.size(), indicesWithLossOfHeterozygosity.size()));
logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f", CNV_SCALE, meanHetRatio));
final Set<Integer> badSites = new TreeSet<>();
badSites.addAll(indicesWithAnomalousCopyRatio);
badSites.addAll(indicesWithLossOfHeterozygosity);
return IntStream.range(0, sites.size()).filter(n -> !badSites.contains(n)).mapToObj(sites::get).filter(s -> s.getAltFraction() > 0.8).collect(Collectors.toList());
}
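As a side note, the het-ratio heuristic above compares the number of apparently heterozygous sites (alt fraction between 0.4 and 0.6) against the count expected under Hardy-Weinberg equilibrium, i.e. the sum of 2p(1 - p) over the nearby population allele frequencies p. The following standalone sketch is not part of GATK and uses made-up values; it only illustrates that expectation, computed here with the Sum class this page aggregates:
import org.apache.commons.math3.stat.descriptive.summary.Sum;
public class HetRatioSketch {
    public static void main(String[] args) {
        // hypothetical population allele frequencies of nearby common variants
        final double[] alleleFrequencies = { 0.10, 0.25, 0.40, 0.33, 0.05 };
        // per-site probability of a het genotype under Hardy-Weinberg: 2p(1 - p)
        final double[] hetProbabilities = new double[alleleFrequencies.length];
        for (int i = 0; i < alleleFrequencies.length; i++) {
            final double p = alleleFrequencies[i];
            hetProbabilities[i] = 2 * p * (1 - p);
        }
        final double expectedHets = new Sum().evaluate(hetProbabilities);
        // hypothetical observed alt fractions at the same sites
        final double[] altFractions = { 0.48, 0.02, 0.55, 0.95, 0.51 };
        final long observedHets = java.util.Arrays.stream(altFractions).filter(x -> 0.4 < x && x < 0.6).count();
        // a ratio well below 1 in a region suggests loss of heterozygosity
        System.out.printf("het ratio = %.3f%n", observedHets / expectedHets);
    }
}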
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk by broadinstitute.
The class FisherExactTest, method twoSidedPValue.
/**
* Computes the two-sided p-value of Fisher's exact test on a normalized 2x2 table whose four entries sum to less than 2 * 200.
*/
public static double twoSidedPValue(final int[][] normalizedTable) {
Utils.nonNull(normalizedTable);
Utils.validateArg(normalizedTable.length == 2, () -> "input must be 2x2 " + Arrays.deepToString(normalizedTable));
Utils.validateArg(normalizedTable[0] != null && normalizedTable[0].length == 2, () -> "input must be 2x2 " + Arrays.deepToString(normalizedTable));
Utils.validateArg(normalizedTable[1] != null && normalizedTable[1].length == 2, () -> "input must be 2x2 " + Arrays.deepToString(normalizedTable));
// Note: this implementation follows the one in the R base package
final int[][] x = normalizedTable;
final int m = x[0][0] + x[0][1];
final int n = x[1][0] + x[1][1];
final int k = x[0][0] + x[1][0];
final int lo = Math.max(0, k - n);
final int hi = Math.min(k, m);
final IndexRange support = new IndexRange(lo, hi + 1);
if (support.size() <= 1) {
//special case, support has only one value
return 1.0;
}
final AbstractIntegerDistribution dist = new HypergeometricDistribution(null, m + n, m, k);
final double[] logds = support.mapToDouble(dist::logProbability);
final double threshold = logds[x[0][0] - lo] * REL_ERR;
final double[] log10ds = DoubleStream.of(logds).filter(d -> d <= threshold).map(MathUtils::logToLog10).toArray();
final double pValue = MathUtils.sumLog10(log10ds);
// min is necessary as numerical precision can result in pValue being slightly greater than 1.0
return Math.min(pValue, 1.0);
}
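For a concrete usage sketch (not taken from the GATK tests), the same hypergeometric construction can be applied directly to a small 2x2 table; for {{3, 1}, {1, 3}} the two-sided p-value works out to about 0.486, in agreement with R's fisher.test:
import org.apache.commons.math3.distribution.HypergeometricDistribution;
public class FisherSketch {
    public static void main(String[] args) {
        final int[][] table = { { 3, 1 }, { 1, 3 } };
        final int m = table[0][0] + table[0][1]; // first row total
        final int n = table[1][0] + table[1][1]; // second row total
        final int k = table[0][0] + table[1][0]; // first column total
        final HypergeometricDistribution dist = new HypergeometricDistribution(m + n, m, k);
        final double observed = dist.probability(table[0][0]);
        double pValue = 0.0;
        // two-sided p-value: sum the probabilities of all outcomes no more likely than the observed one
        for (int x = dist.getSupportLowerBound(); x <= dist.getSupportUpperBound(); x++) {
            // small relative tolerance so ties with the observed probability are included, analogous to REL_ERR above
            if (dist.probability(x) <= observed * (1 + 1e-7)) {
                pValue += dist.probability(x);
            }
        }
        System.out.println(Math.min(pValue, 1.0)); // ~0.4857
    }
}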
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk-protected by broadinstitute.
The class SomaticLikelihoodsEngineUnitTest, method testEvidence.
@Test
public void testEvidence() {
// one exact limit for the evidence is when the likelihoods of each read are so peaked (i.e. the most likely allele
// of each read is much likelier than all other alleles) that the sum over latent read-to-allele assignments
// (that is, over the indicator z in the notes) is dominated by the max-likelihood allele configuration
// and thus the evidence reduces to exactly integrating out the Dirichlet allele fractions
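// In that limit the evidence factorizes (informally): each read contributes its max log10 likelihood,
// and integrating the Dirichlet over the max-likelihood assignment counts (maxLikelihoodCounts below)
// gives the ratio of normalization constants Z(prior) / Z(prior + counts). That is the form of
// expectedLog10Evidence below, assuming log10DirichletNormalization(alpha) returns log10 of
// Z(alpha) = Gamma(sum alpha) / prod Gamma(alpha_i).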
final double[] prior = new double[] { 1, 2 };
final RealMatrix log10Likelihoods = new Array2DRowRealMatrix(2, 4);
log10Likelihoods.setRow(0, new double[] { 0.1, 4.0, 3.0, -10 });
log10Likelihoods.setRow(1, new double[] { -12, -9, -5.0, 0.5 });
final double calculatedLog10Evidence = SomaticLikelihoodsEngine.log10Evidence(log10Likelihoods, prior);
final double[] maxLikelihoodCounts = new double[] { 3, 1 };
final double expectedLog10Evidence = SomaticLikelihoodsEngine.log10DirichletNormalization(prior) - SomaticLikelihoodsEngine.log10DirichletNormalization(MathArrays.ebeAdd(prior, maxLikelihoodCounts)) + new IndexRange(0, log10Likelihoods.getColumnDimension()).sum(read -> log10Likelihoods.getColumnVector(read).getMaxValue());
Assert.assertEquals(calculatedLog10Evidence, expectedLog10Evidence, 1e-5);
// when there's just one read we can calculate the likelihood exactly
final double[] prior2 = new double[] { 1, 2 };
final RealMatrix log10Likelihoods2 = new Array2DRowRealMatrix(2, 1);
log10Likelihoods2.setRow(0, new double[] { 0.1 });
log10Likelihoods2.setRow(1, new double[] { 0.5 });
final double calculatedLog10Evidence2 = SomaticLikelihoodsEngine.log10Evidence(log10Likelihoods2, prior2);
final double[] delta0 = new double[] { 1, 0 };
final double[] delta1 = new double[] { 0, 1 };
final double expectedLog10Evidence2 = MathUtils.log10SumLog10(log10Likelihoods2.getEntry(0, 0) + SomaticLikelihoodsEngine.log10DirichletNormalization(prior2) - SomaticLikelihoodsEngine.log10DirichletNormalization(MathArrays.ebeAdd(prior2, delta0)), log10Likelihoods2.getEntry(1, 0) + SomaticLikelihoodsEngine.log10DirichletNormalization(prior2) - SomaticLikelihoodsEngine.log10DirichletNormalization(MathArrays.ebeAdd(prior2, delta1)));
Assert.assertEquals(calculatedLog10Evidence2, expectedLog10Evidence2, 0.05);
}
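A short companion sketch (not from the GATK test suite) of the single-read identity exercised above: the ratio of Dirichlet normalization constants Z(alpha) / Z(alpha + delta_a) collapses to the prior mean alpha_a / sum(alpha), so the exact single-read evidence is just the prior-weighted average of the per-allele likelihoods. The helper below assumes Z(alpha) = Gamma(sum alpha) / prod Gamma(alpha_i), which is what log10DirichletNormalization is presumed to compute:
import org.apache.commons.math3.special.Gamma;
public class DirichletIdentitySketch {
    // log10 of the assumed Dirichlet normalization constant Z(alpha) = Gamma(sum alpha) / prod Gamma(alpha_i)
    static double log10DirichletNormalization(final double[] alpha) {
        double logZ = Gamma.logGamma(java.util.Arrays.stream(alpha).sum());
        for (final double a : alpha) {
            logZ -= Gamma.logGamma(a);
        }
        return logZ / Math.log(10); // convert natural log to log10
    }
    public static void main(String[] args) {
        final double[] prior = { 1, 2 };
        final double[] priorPlusDelta0 = { 2, 2 }; // prior plus one observation of allele 0
        final double log10Ratio = log10DirichletNormalization(prior) - log10DirichletNormalization(priorPlusDelta0);
        // should equal log10 of the prior mean alpha_0 / sum(alpha) = 1/3
        System.out.println(log10Ratio + " vs " + Math.log10(1.0 / 3.0));
    }
}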
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project knime-core by knime.
The class LogRegPredictor, method getCells.
/**
* {@inheritDoc}
*/
@Override
public DataCell[] getCells(final DataRow row) {
if (hasMissingValues(row)) {
return createMissingOutput();
}
final MissingHandling missingHandling = new MissingHandling(true);
DataCell[] cells = m_includeProbs ? new DataCell[1 + m_targetDomainValuesCount] : new DataCell[1];
Arrays.fill(cells, new IntCell(0));
// row vector of predictor values (1 x number of parameters)
final RealMatrix x = MatrixUtils.createRealMatrix(1, m_parameters.size());
for (int i = 0; i < m_parameters.size(); i++) {
String parameter = m_parameters.get(i);
String predictor = null;
String value = null;
boolean rowIsEmpty = true;
for (final Iterator<String> iter = m_predictors.iterator(); iter.hasNext(); ) {
predictor = iter.next();
value = m_ppMatrix.getValue(parameter, predictor, null);
if (null != value) {
rowIsEmpty = false;
break;
}
}
if (rowIsEmpty) {
x.setEntry(0, i, 1);
} else {
if (m_factors.contains(predictor)) {
List<DataCell> values = m_values.get(predictor);
DataCell cell = row.getCell(m_parameterI.get(parameter));
int index = values.indexOf(cell);
/* When building a general regression model, each
categorical field has one category used as the default
baseline, which therefore does not appear in the
ParameterList in PMML. This design is fine for training,
but during prediction, when the input of a field such as
Employment is the default baseline, the corresponding
parameters should all be 0.
See the commit message for an example and more details.
*/
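// Hypothetical illustration (not the example referenced in the commit message), assuming the
// baseline is the first entry of the values list: if Employment had categories {A, B, C} with
// A as the baseline, only dummy parameters for B and C would exist. An input of B gives
// index == 1, so the entry at offset index - 1 is set to 1; the baseline A gives index == 0
// and all dummy entries stay 0, which is why the branch below only fires for index > 0.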
if (index > 0) {
x.setEntry(0, i + index - 1, 1);
i += values.size() - 2;
}
} else if (m_baseLabelToColName.containsKey(parameter) && m_vectorLengths.containsKey(m_baseLabelToColName.get(parameter))) {
final DataCell cell = row.getCell(m_parameterI.get(parameter));
Optional<NameAndIndex> vectorValue = VectorHandling.parse(predictor);
if (vectorValue.isPresent()) {
int j = vectorValue.get().getIndex();
value = m_ppMatrix.getValue(parameter, predictor, null);
double exponent = Integer.valueOf(value);
double radix = RegressionTrainingRow.getValue(cell, j, missingHandling);
x.setEntry(0, i, Math.pow(radix, exponent));
}
} else {
DataCell cell = row.getCell(m_parameterI.get(parameter));
double radix = ((DoubleValue) cell).getDoubleValue();
double exponent = Integer.valueOf(value);
x.setEntry(0, i, Math.pow(radix, exponent));
}
}
}
// row vector of linear predictor scores (one entry per target category)
RealMatrix r = x.multiply(m_beta);
// determine the column with highest probability
int maxIndex = 0;
double maxValue = r.getEntry(0, 0);
for (int i = 1; i < r.getColumnDimension(); i++) {
if (r.getEntry(0, i) > maxValue) {
maxValue = r.getEntry(0, i);
maxIndex = i;
}
}
if (m_includeProbs) {
// compute probabilities of the target categories
for (int i = 0; i < m_targetCategories.size(); i++) {
// test if calculation would overflow
boolean overflow = false;
for (int k = 0; k < r.getColumnDimension(); k++) {
if ((r.getEntry(0, k) - r.getEntry(0, i)) > 700) {
overflow = true;
}
}
if (!overflow) {
double sum = 0;
for (int k = 0; k < r.getColumnDimension(); k++) {
sum += Math.exp(r.getEntry(0, k) - r.getEntry(0, i));
}
cells[m_targetCategoryIndex.get(i)] = new DoubleCell(1.0 / sum);
} else {
cells[m_targetCategoryIndex.get(i)] = new DoubleCell(0);
}
}
}
// the last cell is the prediction
cells[cells.length - 1] = m_targetCategories.get(maxIndex);
return cells;
}
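The probability loop above is a numerically stable softmax: rather than exp(r_i) / sum_k exp(r_k), which can overflow, it evaluates 1 / sum_k exp(r_k - r_i) and falls back to a probability of 0 whenever a difference exceeds 700 (close to the overflow threshold of double-precision exp). A standalone sketch with made-up scores:
public class StableSoftmaxSketch {
    // probability of category i given linear scores r, computed as 1 / sum_k exp(r[k] - r[i])
    static double softmaxProbability(final double[] r, final int i) {
        double sum = 0;
        for (final double rk : r) {
            final double diff = rk - r[i];
            if (diff > 700) {
                // exp of anything much larger would overflow a double, so the probability is effectively 0
                return 0.0;
            }
            sum += Math.exp(diff);
        }
        return 1.0 / sum;
    }
    public static void main(String[] args) {
        final double[] scores = { 2.0, 1.0, -1.5 }; // hypothetical entries of x * beta
        double total = 0;
        for (int i = 0; i < scores.length; i++) {
            final double p = softmaxProbability(scores, i);
            total += p;
            System.out.printf("P(category %d) = %.4f%n", i, p);
        }
        System.out.printf("sum = %.4f%n", total); // ~1.0
    }
}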
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in project knime-core by knime.
The class TreeNominalColumnData, method calcBestSplitClassificationBinaryPCA.
/**
* Implements the approach proposed by Coppersmith et al. (1999) in their paper
* "Partitioning Nominal Attributes in Decision Trees"
*
* @param columnMemberships
* @param targetPriors
* @param targetColumn
* @param impCriterion
* @param nomVals
* @param targetVals
* @param rd
* @return the best binary split candidate or null if there is no valid split with positive gain
*/
private NominalBinarySplitCandidate calcBestSplitClassificationBinaryPCA(final ColumnMemberships columnMemberships, final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn, final IImpurity impCriterion, final NominalValueRepresentation[] nomVals, final NominalValueRepresentation[] targetVals, final RandomData rd) {
final TreeEnsembleLearnerConfiguration config = getConfiguration();
final int minChildSize = config.getMinChildSize();
final boolean useXGBoostMissingValueHandling = config.getMissingValueHandling() == MissingValueHandling.XGBoost;
// The algorithm combines attribute values with the same class probabilities into a single attribute value,
// therefore it is necessary to track the known class probabilities
final LinkedHashMap<ClassProbabilityVector, CombinedAttributeValues> combinedAttValsMap = new LinkedHashMap<ClassProbabilityVector, CombinedAttributeValues>();
columnMemberships.next();
double totalWeight = 0.0;
boolean branchContainsMissingValues = containsMissingValues();
int start = 0;
final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
final int attToConsider = useXGBoostMissingValueHandling ? nomVals.length : lengthNonMissing;
for (int att = 0; att < lengthNonMissing; /*attToConsider*/ att++) {
int end = start + m_nominalValueCounts[att];
double attWeight = 0.0;
final double[] classFrequencies = new double[targetVals.length];
boolean reachedEnd = false;
for (int index = columnMemberships.getIndexInColumn(); index < end; index = columnMemberships.getIndexInColumn()) {
double weight = columnMemberships.getRowWeight();
assert weight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
int instanceClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
classFrequencies[instanceClass] += weight;
attWeight += weight;
totalWeight += weight;
if (!columnMemberships.next()) {
// reached end of columnMemberships
reachedEnd = true;
if (att == nomVals.length - 1) {
// if the column contains no missing values, the last possible nominal value is
// not the missing value and therefore branchContainsMissingValues needs to be false
branchContainsMissingValues = branchContainsMissingValues && true;
}
break;
}
}
start = end;
if (attWeight < EPSILON) {
// attribute value did not occur in this branch or sample
continue;
}
final double[] classProbabilities = new double[targetVals.length];
for (int i = 0; i < classProbabilities.length; i++) {
classProbabilities[i] = truncateDouble(8, classFrequencies[i] / attWeight);
}
CombinedAttributeValues attVal = new CombinedAttributeValues(classFrequencies, classProbabilities, attWeight, nomVals[att]);
ClassProbabilityVector classProbabilityVector = new ClassProbabilityVector(classProbabilities);
CombinedAttributeValues knownAttVal = combinedAttValsMap.get(classProbabilityVector);
if (knownAttVal == null) {
combinedAttValsMap.put(classProbabilityVector, attVal);
} else {
knownAttVal.combineAttributeValues(attVal);
}
if (reachedEnd) {
break;
}
}
// account for missing values and their weight
double missingWeight = 0.0;
double[] missingClassCounts = null;
// any rows left in columnMemberships at this point belong to the missing-value bucket; otherwise the current indexInColumn won't be larger than start
if (columnMemberships.getIndexInColumn() >= start) {
missingClassCounts = new double[targetVals.length];
do {
final double recordWeight = columnMemberships.getRowWeight();
final int recordClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
missingWeight += recordWeight;
missingClassCounts[recordClass] += recordWeight;
} while (columnMemberships.next());
}
branchContainsMissingValues = missingWeight > EPSILON;
ArrayList<CombinedAttributeValues> attValList = Lists.newArrayList(combinedAttValsMap.values());
CombinedAttributeValues[] attVals = combinedAttValsMap.values().toArray(new CombinedAttributeValues[combinedAttValsMap.size()]);
attVals = BinaryNominalSplitsPCA.calculatePCAOrdering(attVals, totalWeight, targetVals.length);
// EigenDecomposition failed
if (attVals == null) {
return null;
}
// Start searching for split candidates
final int highestBitPosition = containsMissingValues() ? nomVals.length - 2 : nomVals.length - 1;
final double[] binaryImpurityValues = new double[2];
final double[] binaryPartitionWeights = new double[2];
double sumRemainingWeights = totalWeight;
double sumCurrPartitionWeight = 0.0;
RealVector targetFrequenciesCurrentPartition = MatrixUtils.createRealVector(new double[targetVals.length]);
RealVector targetFrequenciesRemaining = MatrixUtils.createRealVector(new double[targetVals.length]);
for (CombinedAttributeValues attVal : attValList) {
targetFrequenciesRemaining = targetFrequenciesRemaining.add(attVal.m_classFrequencyVector);
}
BigInteger currPartitionBitMask = BigInteger.ZERO;
double bestPartitionGain = Double.NEGATIVE_INFINITY;
BigInteger bestPartitionMask = null;
boolean isBestSplitValid = false;
boolean missingsGoLeft = false;
final double priorImpurity = useXGBoostMissingValueHandling ? targetPriors.getPriorImpurity() : impCriterion.getPartitionImpurity(subtractMissingClassCounts(targetPriors.getDistribution(), missingClassCounts), totalWeight);
// no need to iterate over full list because at least one value must remain on the other side of the split
for (int i = 0; i < attVals.length - 1; i++) {
CombinedAttributeValues currAttVal = attVals[i];
sumCurrPartitionWeight += currAttVal.m_totalWeight;
sumRemainingWeights -= currAttVal.m_totalWeight;
assert sumCurrPartitionWeight + sumRemainingWeights == totalWeight : "The weights of the partitions do not sum up to the total weight.";
targetFrequenciesCurrentPartition = targetFrequenciesCurrentPartition.add(currAttVal.m_classFrequencyVector);
targetFrequenciesRemaining = targetFrequenciesRemaining.subtract(currAttVal.m_classFrequencyVector);
currPartitionBitMask = currPartitionBitMask.or(currAttVal.m_bitMask);
boolean partitionIsRightBranch = currPartitionBitMask.testBit(highestBitPosition);
boolean isValidSplit;
double gain;
boolean tempMissingsGoLeft = true;
if (branchContainsMissingValues && useXGBoostMissingValueHandling) {
// send missing values with partition
boolean isValidSplitFirst = sumCurrPartitionWeight + missingWeight >= minChildSize && sumRemainingWeights >= minChildSize;
binaryImpurityValues[0] = impCriterion.getPartitionImpurity(addMissingClassCounts(targetFrequenciesCurrentPartition.toArray(), missingClassCounts), sumCurrPartitionWeight + missingWeight);
binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetFrequenciesRemaining.toArray(), sumRemainingWeights);
binaryPartitionWeights[0] = sumCurrPartitionWeight + missingWeight;
binaryPartitionWeights[1] = sumRemainingWeights;
double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight + missingWeight);
double gainFirst = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight + missingWeight);
// send missing values with remaining
boolean isValidSplitSecond = sumCurrPartitionWeight >= minChildSize && sumRemainingWeights + missingWeight >= minChildSize;
binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetFrequenciesCurrentPartition.toArray(), sumCurrPartitionWeight);
binaryImpurityValues[1] = impCriterion.getPartitionImpurity(addMissingClassCounts(targetFrequenciesRemaining.toArray(), missingClassCounts), sumRemainingWeights + missingWeight);
binaryPartitionWeights[0] = sumCurrPartitionWeight;
binaryPartitionWeights[1] = sumRemainingWeights + missingWeight;
postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight + missingWeight);
double gainSecond = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight + missingWeight);
// choose alternative with better gain
if (gainFirst >= gainSecond) {
gain = gainFirst;
isValidSplit = isValidSplitFirst;
tempMissingsGoLeft = !partitionIsRightBranch;
} else {
gain = gainSecond;
isValidSplit = isValidSplitSecond;
tempMissingsGoLeft = partitionIsRightBranch;
}
} else {
// TODO: if invalid splits should not be considered, skip this partition
isValidSplit = sumCurrPartitionWeight >= minChildSize && sumRemainingWeights >= minChildSize;
binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetFrequenciesCurrentPartition.toArray(), sumCurrPartitionWeight);
binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetFrequenciesRemaining.toArray(), sumRemainingWeights);
binaryPartitionWeights[0] = sumCurrPartitionWeight;
binaryPartitionWeights[1] = sumRemainingWeights;
double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight);
gain = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight);
}
// use random tie breaker if gains are equal
boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
// store if better than before or first valid split
if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
if (isValidSplit || !isBestSplitValid) {
bestPartitionGain = gain;
bestPartitionMask = partitionIsRightBranch ? currPartitionBitMask : BigInteger.ZERO.setBit(highestBitPosition + 1).subtract(BigInteger.ONE).xor(currPartitionBitMask);
isBestSplitValid = isValidSplit;
if (branchContainsMissingValues) {
missingsGoLeft = tempMissingsGoLeft;
// missing values are encountered during the search for the best split
// missingsGoLeft = partitionIsRightBranch;
} else {
// no missing values were encountered during the search for the best split
// missing values should be sent with the majority
missingsGoLeft = partitionIsRightBranch ? sumCurrPartitionWeight < sumRemainingWeights : sumCurrPartitionWeight >= sumRemainingWeights;
}
}
}
}
if (isBestSplitValid && bestPartitionGain > 0.0) {
if (useXGBoostMissingValueHandling) {
return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, NO_MISSED_ROWS, missingsGoLeft ? NominalBinarySplitCandidate.MISSINGS_GO_LEFT : NominalBinarySplitCandidate.MISSINGS_GO_RIGHT);
}
return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
}
return null;
}
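To make the gain bookkeeping in the loop above concrete, here is a minimal sketch with Gini impurity standing in for IImpurity and made-up class counts; it assumes the standard impurity-reduction definition of gain, whereas the actual KNIME criteria may differ in details (and the real code additionally handles missing values, bit masks, and the PCA ordering):
public class BinarySplitGainSketch {
    // Gini impurity of a partition given its per-class weights and total weight
    static double gini(final double[] classWeights, final double partitionWeight) {
        double sumSquares = 0;
        for (final double w : classWeights) {
            sumSquares += (w / partitionWeight) * (w / partitionWeight);
        }
        return 1.0 - sumSquares;
    }
    public static void main(String[] args) {
        // hypothetical class counts in the current partition and in the remaining attribute values
        final double[] currentPartition = { 8, 2 };
        final double[] remaining = { 3, 7 };
        final double currentWeight = 10, remainingWeight = 10, totalWeight = 20;
        final double[] totals = { currentPartition[0] + remaining[0], currentPartition[1] + remaining[1] };
        final double priorImpurity = gini(totals, totalWeight);
        // weighted post-split impurity, playing the role of getPostSplitImpurity
        final double postSplitImpurity = (currentWeight / totalWeight) * gini(currentPartition, currentWeight)
                + (remainingWeight / totalWeight) * gini(remaining, remainingWeight);
        // gain in the sense of impurity reduction: a positive value means the split helps
        System.out.println("gain = " + (priorImpurity - postSplitImpurity));
    }
}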