Search in sources :

Example 1 with HashedListTargetCollection

use of org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection in project gatk-protected by broadinstitute.

the class CalculateContamination method findConfidentHomAltSites.

private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
    if (sites.isEmpty()) {
        return new ArrayList<>();
    }
    final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(sites);
    final double averageCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();
    final List<Double> smoothedCopyRatios = new ArrayList<>();
    final List<Double> hetRatios = new ArrayList<>();
    for (final PileupSummary site : sites) {
        final SimpleInterval nearbySpan = new SimpleInterval(site.getContig(), Math.max(1, site.getStart() - CNV_SCALE), site.getEnd() + CNV_SCALE);
        final List<PileupSummary> nearbySites = tc.targets(nearbySpan);
        final double averageNearbyCopyRatio = nearbySites.stream().mapToDouble(s -> s.getTotalCount() / averageCoverage).average().orElseGet(() -> 0);
        smoothedCopyRatios.add(averageNearbyCopyRatio);
        final double expectedNumberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAlleleFrequency).map(x -> 2 * x * (1 - x)).sum();
        final long numberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAltFraction).filter(x -> 0.4 < x && x < 0.6).count();
        final double hetRatio = numberOfNearbyHets / expectedNumberOfNearbyHets;
        hetRatios.add(hetRatio);
    }
    final double medianSmoothedCopyRatio = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
    final List<Integer> indicesWithAnomalousCopyRatio = IntStream.range(0, sites.size()).filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianSmoothedCopyRatio || smoothedCopyRatios.get(n) > 2 * medianSmoothedCopyRatio).boxed().collect(Collectors.toList());
    final double meanHetRatio = hetRatios.stream().mapToDouble(x -> x).average().getAsDouble();
    final List<Integer> indicesWithLossOfHeterozygosity = IntStream.range(0, sites.size()).filter(n -> hetRatios.get(n) < meanHetRatio * 0.5).boxed().collect(Collectors.toList());
    //TODO: as extra security, filter out sites that are near too many hom alts
    logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity", indicesWithAnomalousCopyRatio.size(), indicesWithLossOfHeterozygosity.size()));
    logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f", CNV_SCALE, meanHetRatio));
    final Set<Integer> badSites = new TreeSet<>();
    badSites.addAll(indicesWithAnomalousCopyRatio);
    badSites.addAll(indicesWithLossOfHeterozygosity);
    return IntStream.range(0, sites.size()).filter(n -> !badSites.contains(n)).mapToObj(sites::get).filter(s -> s.getAltFraction() > 0.8).collect(Collectors.toList());
}
Also used : IntStream(java.util.stream.IntStream) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) CommandLineProgram(org.broadinstitute.hellbender.cmdline.CommandLineProgram) Argument(org.broadinstitute.barclay.argparser.Argument) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) VariantProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup) File(java.io.File) Logger(org.apache.logging.log4j.Logger) HashedListTargetCollection(org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) LogManager(org.apache.logging.log4j.LogManager) TargetCollection(org.broadinstitute.hellbender.tools.exome.TargetCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HashedListTargetCollection(org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 2 with HashedListTargetCollection

use of org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection in project gatk by broadinstitute.

the class CalculateContamination method findConfidentHomAltSites.

private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
    if (sites.isEmpty()) {
        return new ArrayList<>();
    }
    final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(sites);
    final double averageCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();
    final List<Double> smoothedCopyRatios = new ArrayList<>();
    final List<Double> hetRatios = new ArrayList<>();
    for (final PileupSummary site : sites) {
        final SimpleInterval nearbySpan = new SimpleInterval(site.getContig(), Math.max(1, site.getStart() - CNV_SCALE), site.getEnd() + CNV_SCALE);
        final List<PileupSummary> nearbySites = tc.targets(nearbySpan);
        final double averageNearbyCopyRatio = nearbySites.stream().mapToDouble(s -> s.getTotalCount() / averageCoverage).average().orElseGet(() -> 0);
        smoothedCopyRatios.add(averageNearbyCopyRatio);
        final double expectedNumberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAlleleFrequency).map(x -> 2 * x * (1 - x)).sum();
        final long numberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAltFraction).filter(x -> 0.4 < x && x < 0.6).count();
        final double hetRatio = numberOfNearbyHets / expectedNumberOfNearbyHets;
        hetRatios.add(hetRatio);
    }
    final double medianSmoothedCopyRatio = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
    final List<Integer> indicesWithAnomalousCopyRatio = IntStream.range(0, sites.size()).filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianSmoothedCopyRatio || smoothedCopyRatios.get(n) > 2 * medianSmoothedCopyRatio).boxed().collect(Collectors.toList());
    final double meanHetRatio = hetRatios.stream().mapToDouble(x -> x).average().getAsDouble();
    final List<Integer> indicesWithLossOfHeterozygosity = IntStream.range(0, sites.size()).filter(n -> hetRatios.get(n) < meanHetRatio * 0.5).boxed().collect(Collectors.toList());
    //TODO: as extra security, filter out sites that are near too many hom alts
    logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity", indicesWithAnomalousCopyRatio.size(), indicesWithLossOfHeterozygosity.size()));
    logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f", CNV_SCALE, meanHetRatio));
    final Set<Integer> badSites = new TreeSet<>();
    badSites.addAll(indicesWithAnomalousCopyRatio);
    badSites.addAll(indicesWithLossOfHeterozygosity);
    return IntStream.range(0, sites.size()).filter(n -> !badSites.contains(n)).mapToObj(sites::get).filter(s -> s.getAltFraction() > 0.8).collect(Collectors.toList());
}
Also used : IntStream(java.util.stream.IntStream) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) CommandLineProgram(org.broadinstitute.hellbender.cmdline.CommandLineProgram) Argument(org.broadinstitute.barclay.argparser.Argument) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) VariantProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup) File(java.io.File) Logger(org.apache.logging.log4j.Logger) HashedListTargetCollection(org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) LogManager(org.apache.logging.log4j.LogManager) TargetCollection(org.broadinstitute.hellbender.tools.exome.TargetCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HashedListTargetCollection(org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Aggregations

File (java.io.File)2 java.util (java.util)2 Collectors (java.util.stream.Collectors)2 IntStream (java.util.stream.IntStream)2 Median (org.apache.commons.math3.stat.descriptive.rank.Median)2 LogManager (org.apache.logging.log4j.LogManager)2 Logger (org.apache.logging.log4j.Logger)2 Argument (org.broadinstitute.barclay.argparser.Argument)2 CommandLineProgramProperties (org.broadinstitute.barclay.argparser.CommandLineProgramProperties)2 CommandLineProgram (org.broadinstitute.hellbender.cmdline.CommandLineProgram)2 StandardArgumentDefinitions (org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions)2 VariantProgramGroup (org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup)2 HashedListTargetCollection (org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection)2 TargetCollection (org.broadinstitute.hellbender.tools.exome.TargetCollection)2 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)2