Use of org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection in the gatk-protected project by broadinstitute.
From the class CalculateContamination, method findConfidentHomAltSites:
private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
    if (sites.isEmpty()) {
        return new ArrayList<>();
    }

    // Index the sites by genomic interval so that all sites near a given site can be fetched quickly.
    final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(sites);
    final double averageCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();

    final List<Double> smoothedCopyRatios = new ArrayList<>();
    final List<Double> hetRatios = new ArrayList<>();
    for (final PileupSummary site : sites) {
        // Gather all sites within CNV_SCALE bases of this site.
        final SimpleInterval nearbySpan = new SimpleInterval(site.getContig(), Math.max(1, site.getStart() - CNV_SCALE), site.getEnd() + CNV_SCALE);
        final List<PileupSummary> nearbySites = tc.targets(nearbySpan);

        // Local copy ratio: average coverage of nearby sites relative to the genome-wide average.
        final double averageNearbyCopyRatio = nearbySites.stream().mapToDouble(s -> s.getTotalCount() / averageCoverage).average().orElseGet(() -> 0);
        smoothedCopyRatios.add(averageNearbyCopyRatio);

        // Expected het count from population allele frequencies (2pq per site) versus observed hets (alt fraction near 0.5).
        final double expectedNumberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAlleleFrequency).map(x -> 2 * x * (1 - x)).sum();
        final long numberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAltFraction).filter(x -> 0.4 < x && x < 0.6).count();
        final double hetRatio = numberOfNearbyHets / expectedNumberOfNearbyHets;
        hetRatios.add(hetRatio);
    }

    // Flag sites whose smoothed copy ratio deviates strongly from the median.
    final double medianSmoothedCopyRatio = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
    final List<Integer> indicesWithAnomalousCopyRatio = IntStream.range(0, sites.size())
            .filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianSmoothedCopyRatio || smoothedCopyRatios.get(n) > 2 * medianSmoothedCopyRatio)
            .boxed().collect(Collectors.toList());

    // Flag sites with far fewer nearby hets than expected -- a sign of loss of heterozygosity.
    final double meanHetRatio = hetRatios.stream().mapToDouble(x -> x).average().getAsDouble();
    final List<Integer> indicesWithLossOfHeterozygosity = IntStream.range(0, sites.size())
            .filter(n -> hetRatios.get(n) < meanHetRatio * 0.5)
            .boxed().collect(Collectors.toList());

    //TODO: as extra security, filter out sites that are near too many hom alts
    logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity",
            indicesWithAnomalousCopyRatio.size(), indicesWithLossOfHeterozygosity.size()));
    logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f",
            CNV_SCALE, meanHetRatio));

    final Set<Integer> badSites = new TreeSet<>();
    badSites.addAll(indicesWithAnomalousCopyRatio);
    badSites.addAll(indicesWithLossOfHeterozygosity);

    // Keep sites that passed both filters and look confidently hom-alt (alt fraction > 0.8).
    return IntStream.range(0, sites.size())
            .filter(n -> !badSites.contains(n))
            .mapToObj(sites::get)
            .filter(s -> s.getAltFraction() > 0.8)
            .collect(Collectors.toList());
}
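The pattern worth noting is the interval query: the sites are wrapped in a HashedListTargetCollection, and tc.targets(nearbySpan) returns every PileupSummary whose interval overlaps a window around the current site. The following is a minimal sketch, not part of the GATK source, isolating that pattern; the class name NearbySitesExample, the WINDOW constant, and the import path for PileupSummary are assumptions (package paths may differ between GATK versions), and the input list is assumed to be coordinate-sorted.

// Minimal sketch (assumed names; not from the GATK source) of the interval-query pattern used above.
import java.util.List;

import org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection;
import org.broadinstitute.hellbender.tools.exome.TargetCollection;
import org.broadinstitute.hellbender.tools.walkers.contamination.PileupSummary; // assumed package
import org.broadinstitute.hellbender.utils.SimpleInterval;

public final class NearbySitesExample {

    // Hypothetical window size; CalculateContamination uses its own CNV_SCALE constant.
    private static final int WINDOW = 10_000_000;

    /**
     * Returns the pileup records overlapping a window of +/- WINDOW bases around {@code site},
     * assuming {@code allSites} is coordinate-sorted.
     */
    public static List<PileupSummary> nearbySites(final List<PileupSummary> allSites, final PileupSummary site) {
        // Index the records by genomic interval, as findConfidentHomAltSites does.
        final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(allSites);
        final SimpleInterval window = new SimpleInterval(site.getContig(),
                Math.max(1, site.getStart() - WINDOW),
                site.getEnd() + WINDOW);
        // targets(Locatable) returns every record whose interval overlaps the query window.
        return tc.targets(window);
    }
}

Rebuilding the collection on each call is acceptable for a one-off query; findConfidentHomAltSites builds it once and reuses it across all sites, which is the cheaper choice when every site in the list is queried.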
Use of org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection in the gatk project by broadinstitute.
From the class CalculateContamination, method findConfidentHomAltSites; the implementation is identical to the gatk-protected version shown above.