Search in sources:

Example 1 with ContigDictComparator

Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.

The class MantaMerger, method doWork:

@Override
public int doWork(final List<String> args) {
    VariantContextWriter out = null;
    try {
        final Map<String, VcfInput> sample2inputs = new TreeMap<>();
        SAMSequenceDictionary dict = null;
        final List<String> lines;
        if (args.size() == 1 && args.get(0).endsWith(".list")) {
            lines = Files.lines(Paths.get(args.get(0))).filter(L -> !(StringUtils.isBlank(L) || L.startsWith("#"))).collect(Collectors.toList());
        } else {
            lines = args;
        }
        for (final String line : lines) {
            final String[] tokens = CharSplitter.TAB.split(line);
            final VcfInput vcfInput = new VcfInput();
            vcfInput.vcfPath = Paths.get(tokens[0]);
            IOUtil.assertFileIsReadable(vcfInput.vcfPath);
            final SAMSequenceDictionary dict1 = SequenceDictionaryUtils.extractRequired(vcfInput.vcfPath);
            if (dict == null) {
                dict = dict1;
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict1)) {
                throw new JvarkitException.DictionariesAreNotTheSame(dict1, dict);
            }
            if (tokens.length < 2 || StringUtils.isBlank(tokens[1])) {
                try (VCFReader r = VCFReaderFactory.makeDefault().open(vcfInput.vcfPath, false)) {
                    List<String> snl = r.getHeader().getSampleNamesInOrder();
                    if (snl.size() == 1) {
                        vcfInput.sample = snl.get(0);
                    } else {
                        vcfInput.sample = vcfInput.vcfPath.toString();
                    }
                }
            } else {
                vcfInput.sample = tokens[1];
            }
            if (sample2inputs.containsKey(vcfInput.sample)) {
                LOG.error("duplicate sample " + vcfInput.sample);
                return -1;
            }
            sample2inputs.put(vcfInput.sample, vcfInput);
        }
        if (sample2inputs.isEmpty()) {
            LOG.error("no input found");
            return -1;
        }
        if (!StringUtils.isBlank(this.limitContig) && dict.getSequence(this.limitContig) == null) {
            LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(this.limitContig, dict));
            return -1;
        }
        final Predicate<VariantContext> bedPredicate;
        if (this.excludeBedPath != null) {
            final BedLineCodec codec = new BedLineCodec();
            final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
            final IntervalTreeMap<Boolean> map = new IntervalTreeMap<>();
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
                br.lines()
                    .filter(L -> !BedLine.isBedHeader(L))
                    .map(L -> codec.decode(L))
                    .filter(L -> L != null)
                    .filter(B -> !StringUtils.isBlank(converter.apply(B.getContig())))
                    .map(B -> new Interval(converter.apply(B.getContig()), B.getStart(), B.getEnd()))
                    .forEach(R -> map.put(R, true));
            }
            bedPredicate = V -> !map.containsOverlapping(V);
        } else {
            bedPredicate = V -> true;
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
        metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
        final VCFInfoHeaderLine infoSvLen = new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation length");
        metaData.add(infoSvLen);
        final VCFInfoHeaderLine infoNSamples = new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples");
        metaData.add(infoNSamples);
        final VCFInfoHeaderLine infoSamples = new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples");
        metaData.add(infoSamples);
        final VCFFilterHeaderLine filterSameNext = new VCFFilterHeaderLine("NEXT", "next variant in VCF is the same.");
        metaData.add(filterSameNext);
        final VCFFilterHeaderLine filterSamePrev = new VCFFilterHeaderLine("PREV", "previous variant in VCF is the same.");
        metaData.add(filterSamePrev);
        final VCFHeader header = new VCFHeader(metaData, sample2inputs.keySet());
        header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        out = VCFUtils.createVariantContextWriterToPath(this.outputFile);
        out.writeHeader(header);
        final Decoy decoy = Decoy.getDefaultInstance();
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            if (!StringUtils.isBlank(this.limitContig)) {
                if (!ssr.getSequenceName().equals(this.limitContig))
                    continue;
            }
            LOG.info("contig " + ssr.getSequenceName());
            if (decoy.isDecoy(ssr.getSequenceName()))
                continue;
            final Map<SVKey, Set<String>> variants2samples = new HashMap<>();
            for (final VcfInput vcfinput : sample2inputs.values()) {
                // reset count for this contig
                vcfinput.contigCount = 0;
                try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
                    vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength()).stream()
                        .filter(V -> !discard_bnd || !V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
                        .filter(bedPredicate)
                        .map(V -> new VariantContextBuilder(V).unfiltered().noID().noGenotypes()
                            .rmAttribute("EVENT").rmAttribute("HOMSEQ").rmAttribute("HOMLEN")
                            .rmAttribute("SVINSSEQ").rmAttribute("SVINSLEN").rmAttribute("MATEID")
                            .rmAttribute("LEFT_SVINSSEQ").rmAttribute("RIGHT_SVINSSEQ")
                            .rmAttribute("BND_DEPTH").rmAttribute("MATE_BND_DEPTH")
                            .rmAttribute("JUNCTION_QUAL").rmAttribute("CIGAR")
                            .make())
                        .forEach(V -> {
                        final SVKey key1 = new SVKey(V);
                        if (!svComparator.test(V, V))
                            throw new RuntimeException("compare to self failed ! " + V);
                        variants2samples.put(key1, new HashSet<>());
                        vcfinput.contigCount++;
                    });
                }
            }
            if (variants2samples.isEmpty())
                continue;
            // build an interval tree for a faster access
            final IntervalTree<SVKey> intervalTree = new IntervalTree<>();
            for (final SVKey key : variants2samples.keySet()) {
                final SimpleInterval r = new SimpleInterval(key.archetype).extend(this.svComparator.getBndDistance() + 1);
                intervalTree.put(r.getStart(), r.getEnd(), key);
                // paranoid check: the interval we just inserted must be able to find the current archetype
                boolean found = false;
                final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
                while (nodeIter.hasNext()) {
                    final SVKey key1 = nodeIter.next().getValue();
                    if (this.svComparator.test(key1.archetype, key.archetype)) {
                        found = true;
                        break;
                    }
                }
                if (!found) {
                    out.close();
                    throw new RuntimeException("cannot find self " + key.archetype + " in " + r);
                }
            }
            for (final VcfInput vcfinput : sample2inputs.values()) {
                try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
                    final CloseableIterator<VariantContext> iter = vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength());
                    while (iter.hasNext()) {
                        final VariantContext ctx = iter.next();
                        if (this.discard_bnd && ctx.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
                            continue;
                        if (!bedPredicate.test(ctx))
                            continue;
                        final SimpleInterval r = new SimpleInterval(ctx).extend(this.svComparator.getBndDistance() + 1);
                        final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
                        while (nodeIter.hasNext()) {
                            final SVKey key1 = nodeIter.next().getValue();
                            if (!this.svComparator.test(key1.archetype, ctx))
                                continue;
                            final Set<String> samples = variants2samples.get(key1);
                            samples.add(vcfinput.sample);
                        }
                    }
                    iter.close();
                }
            }
            final Comparator<VariantContext> sorter = new ContigDictComparator(dict).createLocatableComparator();
            final List<SVKey> orderedKeys = variants2samples.keySet().stream()
                .filter(K -> !variants2samples.get(K).isEmpty())  // skip keys that gathered no sample
                .sorted((A, B) -> sorter.compare(A.archetype, B.archetype))
                .collect(Collectors.toCollection(ArrayList::new));
            // remove duplicates
            int i = 0;
            while (i + 1 < orderedKeys.size()) {
                final SVKey key1 = orderedKeys.get(i);
                final SVKey key2 = orderedKeys.get(i + 1);
                if (svComparator.test(key1.archetype, key2.archetype) && // same samples
                variants2samples.get(key1).equals(variants2samples.get(key2))) {
                    orderedKeys.remove(i + 1);
                } else {
                    i++;
                }
            }
            for (int key_index = 0; key_index < orderedKeys.size(); key_index++) {
                final SVKey key = orderedKeys.get(key_index);
                final Set<String> samples = variants2samples.get(key);
                final Allele refAllele = key.archetype.getReference();
                final Allele altAllele = Allele.create("<SV>", false);
                final Object svType = key.archetype.getAttribute(VCFConstants.SVTYPE, ".");
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(key.archetype.getContig());
                vcb.start(key.archetype.getStart());
                vcb.stop(key.archetype.getEnd());
                vcb.log10PError(key.archetype.getLog10PError());
                vcb.alleles(Arrays.asList(refAllele, altAllele));
                vcb.attribute(VCFConstants.END_KEY, key.archetype.getEnd());
                vcb.attribute(VCFConstants.SVTYPE, svType);
                vcb.attribute(infoSvLen.getID(), (svType.equals("DEL") ? -1 : 1) * key.archetype.getLengthOnReference());
                vcb.attribute(infoNSamples.getID(), samples.size());
                vcb.attribute(infoSamples.getID(), samples.stream().sorted().collect(Collectors.toList()));
                int ac = 0;
                final List<Genotype> genotypes = new ArrayList<>(sample2inputs.size());
                for (final String sn : sample2inputs.keySet()) {
                    List<Allele> gta;
                    if (samples.contains(sn)) {
                        ac++;
                        gta = Arrays.asList(refAllele, altAllele);
                    } else {
                        gta = Arrays.asList(refAllele, refAllele);
                    }
                    genotypes.add(new GenotypeBuilder(sn, gta).make());
                }
                vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, ac);
                vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, sample2inputs.size() * 2);
                if (ac > 0) {
                    vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, ac / (2f * sample2inputs.size()));
                }
                if (key_index > 0 && svComparator.test(key.archetype, orderedKeys.get(key_index - 1).archetype)) {
                    vcb.filter(filterSamePrev.getID());
                }
                if (key_index + 1 < orderedKeys.size() && svComparator.test(key.archetype, orderedKeys.get(key_index + 1).archetype)) {
                    System.err.println("SAME\n" + key.archetype + "\n" + orderedKeys.get(key_index + 1).archetype);
                    vcb.filter(filterSameNext.getID());
                }
                vcb.genotypes(genotypes);
                out.add(vcb.make());
            }
        }
        out.close();
        out = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) IOUtil(htsjdk.samtools.util.IOUtil) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) Map(java.util.Map) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) HashMap(java.util.HashMap) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Decoy(com.github.lindenb.jvarkit.samtools.Decoy) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) VCFConstants(htsjdk.variant.vcf.VCFConstants) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Iterator(java.util.Iterator) Files(java.nio.file.Files) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) StructuralVariantComparator(com.github.lindenb.jvarkit.variant.sv.StructuralVariantComparator) IntervalTree(htsjdk.samtools.util.IntervalTree) TreeMap(java.util.TreeMap) Paths(java.nio.file.Paths) BufferedReader(java.io.BufferedReader) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) IntervalTree(htsjdk.samtools.util.IntervalTree) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) Decoy(com.github.lindenb.jvarkit.samtools.Decoy) Genotype(htsjdk.variant.variantcontext.Genotype) 
GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Interval(htsjdk.samtools.util.Interval) Set(java.util.Set) HashSet(java.util.HashSet) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) VCFReader(htsjdk.variant.vcf.VCFReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) TreeMap(java.util.TreeMap) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap)
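
The pattern to note in this example is new ContigDictComparator(dict).createLocatableComparator(), which returns a Comparator<Locatable> ordering records by the contig order of the SAMSequenceDictionary, then by start and end. Below is a minimal, self-contained sketch of that call; the dictionary is hand-built here and its contig names and lengths are only illustrative.

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import com.github.lindenb.jvarkit.util.samtools.ContigDictComparator;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Locatable;

public class SortByDictOrderDemo {
    public static void main(final String[] args) {
        // deliberately list chr2 before chr1: sorting follows the dictionary order, not lexicographic order
        final SAMSequenceDictionary dict = new SAMSequenceDictionary(Arrays.asList(
                new SAMSequenceRecord("chr2", 242_193_529),
                new SAMSequenceRecord("chr1", 248_956_422)));
        final Comparator<Locatable> cmp = new ContigDictComparator(dict).createLocatableComparator();
        final List<Interval> intervals = Arrays.asList(
                new Interval("chr1", 100, 200),
                new Interval("chr2", 50, 60),
                new Interval("chr1", 10, 20));
        // Arrays.asList supports in-place sort; the chr2 interval comes first because the dictionary lists chr2 first
        intervals.sort(cmp);
        intervals.forEach(r -> System.out.println(r.getContig() + ":" + r.getStart() + "-" + r.getEnd()));
    }
}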

Example 2 with ContigDictComparator

Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.

The class VcfScanUpstreamOrf, method beforeVcf:

@Override
protected int beforeVcf() {
    try {
        this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
        this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
        try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
            gtfReader.setContigNameConverter(this.refCtgNameConverter);
            // temporary IntervalTreeMap of transcripts; used later to remove uORFs overlapping the CDS of an alternate transcript
            final IntervalTreeMap<List<Transcript>> tmpTreeMap = new IntervalTreeMap<>();
            gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasCodonStartDefined() && T.hasCodonStopDefined()).filter(T -> T.getTranscriptUTR5().isPresent()).filter(T -> (!this.plus_strand_only || T.isPositiveStrand())).forEach(T -> {
                final Interval interval = new Interval(T);
                List<Transcript> L = tmpTreeMap.get(interval);
                if (L == null) {
                    L = new ArrayList<>();
                    tmpTreeMap.put(interval, L);
                }
                L.add(T);
            });
            for (final Transcript transcript : tmpTreeMap.values().stream().flatMap(L -> L.stream()).collect(Collectors.toList())) {
                final Interval interval = transcript.getTranscriptUTR5().get().toInterval();
                if (this.exclude_cds_overlaping_alternative) {
                    if (tmpTreeMap.getOverlapping(interval).stream()
                            .flatMap(L -> L.stream())
                            .filter(G -> G != transcript)  // exclude the same object in memory
                            .anyMatch(K -> overlapCDS(transcript, K))) {
                        continue;
                    }
                }
                List<Transcript> L = this.transcriptMap.get(interval);
                if (L == null) {
                    L = new ArrayList<>();
                    this.transcriptMap.put(interval, L);
                }
                if (this.canonical_utr) {
                    if (L.stream().map(K -> K.getTranscriptUTR5().get().toInterval()).anyMatch(R -> R.isNegativeStrand() == interval.isNegativeStrand() && R.equals(interval)))
                        continue;
                }
                L.add(transcript);
            }
            LOG.info("number of transcripts :" + this.transcriptMap.values().stream().flatMap(L -> L.stream()).count());
        }
        if (this.transcriptMap.isEmpty()) {
            LOG.error("no transcripts found in " + this.gtfPath);
            return -1;
        }
        if (this.archivePath != null) {
            try (final ArchiveFactory archive = ArchiveFactory.open(this.archivePath)) {
                final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
                final ContigDictComparator ctgCmp = new ContigDictComparator(dict);
                final Comparator<Locatable> locCmp1 = (A, B) -> {
                    int i = ctgCmp.compare(A.getContig(), B.getContig());
                    if (i != 0)
                        return i;
                    i = Integer.compare(A.getStart(), B.getStart());
                    if (i != 0)
                        return i;
                    return Integer.compare(A.getEnd(), B.getEnd());
                };
                final PrintWriter pw1 = archive.openWriter("uorf-dna.fa");
                final PrintWriter pw2 = archive.openWriter("uorf-pep.fa");
                final PrintWriter pw3 = archive.openWriter("uorf.bed");
                pw3.println("track name=\"uORF\" description=\"uORF for " + this.gtfPath + "\"");
                pw3.println("#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tblockCount\tblockSizes\tblockStarts");
                final ProgressFactory.Watcher<Locatable> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
                for (final Transcript kg : this.transcriptMap.values().stream().flatMap(L -> L.stream()).sorted(locCmp1).collect(Collectors.toList())) {
                    /* new reference sequence */
                    if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(kg.getContig())) {
                        this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, kg.getContig());
                    }
                    final TranscriptRNA mRNA = new TranscriptRNA(kg);
                    final UpstreamORF uorf = new UpstreamORF(mRNA);
                    progress.apply(new Interval(uorf.getContig(), uorf.getChromStart() + 1, uorf.getChromEnd()));
                    final OpenReadingFrame orf = uorf.getBestOpenReadingFrame();
                    if (orf == null)
                        continue;
                    orf.printFastaDNA(pw1);
                    orf.printFastaPep(pw2);
                    orf.printBed(pw3);
                }
                pw1.flush();
                pw1.close();
                pw2.flush();
                pw2.close();
                pw3.flush();
                pw3.close();
                progress.close();
            }
        }
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) AbstractCharSequence(com.github.lindenb.jvarkit.lang.AbstractCharSequence) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) Map(java.util.Map) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) KozakSequence(com.github.lindenb.jvarkit.util.bio.KozakSequence) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) LinkedHashSet(java.util.LinkedHashSet) Locatable(htsjdk.samtools.util.Locatable) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) OnePassVcfLauncher(com.github.lindenb.jvarkit.jcommander.OnePassVcfLauncher) Paranoid(com.github.lindenb.jvarkit.lang.Paranoid) Comparator(java.util.Comparator) Algorithms(com.github.lindenb.jvarkit.util.Algorithms) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) List(java.util.List) ArrayList(java.util.ArrayList) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Interval(htsjdk.samtools.util.Interval) Locatable(htsjdk.samtools.util.Locatable) PrintWriter(java.io.PrintWriter)
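
Instead of calling createLocatableComparator(), this example composes the Locatable comparator by hand (locCmp1). The same ordering can be expressed with Comparator chaining; this is only a sketch, assuming a SAMSequenceDictionary named dict is in scope along with the usual imports (java.util.Comparator, htsjdk.samtools.util.Locatable, ContigDictComparator).

// same ordering as locCmp1 above: contig in dictionary order, then start, then end
final ContigDictComparator ctgCmp = new ContigDictComparator(dict);
final Comparator<Locatable> locCmp = Comparator
        .comparing(Locatable::getContig, ctgCmp)
        .thenComparingInt(Locatable::getStart)
        .thenComparingInt(Locatable::getEnd);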

Example 3 with ContigDictComparator

Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.

The class MergeStructuralVariants, method doWork:

@Override
public int doWork(final List<String> args) {
    if (this.max_distance < 0) {
        LOG.error("bad max_distance :" + this.max_distance);
        return -1;
    }
    final List<VcfInput> vcf_readers = new ArrayList<>();
    try {
        SAMSequenceDictionary dict = null;
        final List<Path> inputPaths = (IOUtils.unrollPaths(args));
        if (inputPaths.isEmpty()) {
            LOG.error("input is empty");
            return -1;
        }
        final Set<VCFHeaderLine> metadata = new HashSet<>();
        for (int i = 0; i < inputPaths.size(); i++) {
            final VcfInput vcfInput = new VcfInput(inputPaths.get(i));
            vcf_readers.add(vcfInput);
            if (dict == null) {
                dict = vcfInput.dict;
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, vcfInput.dict)) {
                LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(vcfInput.dict, dict));
                return -1;
            }
        }
        final Comparator<String> contigComparator = new ContigDictComparator(dict);
        final Comparator<CnvCall> comparator = (A, B) -> {
            int i = contigComparator.compare(A.getContig(), B.getContig());
            if (i != 0)
                return i;
            i = Integer.compare(A.getStart(), B.getStart());
            if (i != 0)
                return i;
            i = Integer.compare(A.getEnd(), B.getEnd());
            if (i != 0)
                return i;
            i = A.getType().compareTo(B.getType());
            return i;
        };
        VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.GENOTYPE_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.END_KEY);
        metadata.add(new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples carrying the SV"));
        metadata.add(new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples carrying the SV"));
        metadata.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length"));
        metadata.add(new VCFInfoHeaderLine("CIPOS", 2, VCFHeaderLineType.Integer, "Confidence interval around POS for imprecise variants"));
        metadata.add(new VCFInfoHeaderLine("CIEND", 2, VCFHeaderLineType.Integer, "Confidence interval around END for imprecise variants"));
        metadata.add(new VCFInfoHeaderLine("IMPRECISE", 0, VCFHeaderLineType.Flag, "Imprecise structural variation"));
        /*metadata.add(new VCFFormatHeaderLine(
					"OV",1,
					VCFHeaderLineType.Integer,
					"Number calls (with different sample) overlapping this genotype"
					));*/
        metadata.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "SV type"));
        final VCFHeader header = new VCFHeader(metadata, (Set<String>) vcf_readers.stream().map(F -> F.sample).collect(Collectors.toCollection(TreeSet::new)));
        header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        try (VariantContextWriter out = this.writingVariantsDelegate.open(this.outputFile)) {
            out.writeHeader(header);
            for (int tid = 0; tid < dict.size(); tid++) {
                final SAMSequenceRecord ssr = dict.getSequence(tid);
                for (VcfInput vin : vcf_readers) {
                    vin.loadVariants(ssr);
                }
                final List<CnvCall> intervals_list = vcf_readers.stream().flatMap(F -> F.callMap.values().stream()).collect(Collectors.toSet()).stream().sorted(comparator).collect(Collectors.toList());
                for (final CnvCall baseCall : intervals_list) {
                    if (baseCall.echoed_flag)
                        continue;
                    final Allele ref = Allele.create("N", true);
                    final Allele alt = Allele.create("<" + baseCall.svType + ">", false);
                    final List<Allele> alleles = Arrays.asList(ref, alt);
                    final List<CnvCall> callsToPrint = vcf_readers.stream().flatMap(F -> F.callMap.getOverlapping(baseCall).stream().filter(C -> C.getType().equals(baseCall.getType())).filter(C -> !C.echoed_flag).filter(C -> testOverlapping(C, baseCall))).collect(Collectors.toList());
                    if (callsToPrint.isEmpty()) {
                        throw new IllegalStateException();
                    }
                    final VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(baseCall.getContig());
                    vcb.start(baseCall.getStart());
                    vcb.stop(baseCall.getEnd());
                    vcb.attribute(VCFConstants.END_KEY, baseCall.getEnd());
                    vcb.attribute(VCFConstants.SVTYPE, baseCall.getType());
                    vcb.attribute("SVLEN", CoordMath.getLength(baseCall.getStart(), baseCall.getEnd()));
                    for (int side = 0; side < 2; side++) {
                        final Function<CnvCall, Integer> coordExtractor;
                        if (side == 0) {
                            coordExtractor = C -> C.getStart();
                        } else {
                            coordExtractor = C -> C.getEnd();
                        }
                        final List<Integer> list = Arrays.asList(callsToPrint.stream().mapToInt(C -> coordExtractor.apply(C) - coordExtractor.apply(baseCall)).min().orElse(0), callsToPrint.stream().mapToInt(C -> coordExtractor.apply(C) - coordExtractor.apply(baseCall)).max().orElse(0));
                        vcb.attribute(side == 0 ? "CIPOS" : "CIEND", list);
                    }
                    vcb.attribute("IMPRECISE", true);
                    final Map<String, Genotype> sample2gt = new HashMap<>(callsToPrint.size());
                    for (final CnvCall call : callsToPrint) {
                        if (sample2gt.containsKey(call.getSample())) {
                            LOG.warn("Sample " + call.getSample() + " exits twice at the same loc " + call + " could be two small SV overlapping a big one.");
                            continue;
                        }
                        call.echoed_flag = true;
                        // final List<Allele> alleles=Arrays.asList(ref,alt);
                        final GenotypeBuilder gb = new GenotypeBuilder(call.sample, alleles);
                        sample2gt.put(call.getSample(), gb.make());
                    }
                    vcb.attribute("SAMPLES", new ArrayList<>(sample2gt.keySet()));
                    vcb.attribute("NSAMPLES", sample2gt.size());
                    vcb.genotypes(sample2gt.values());
                    vcb.alleles(alleles);
                    out.add(vcb.make());
                }
            // end of loop interval
            }
        /*
				all_calls.values().stream().
					flatMap(C->C.stream()).
					filter(C->!C.echoed_flag).
					forEach(C->LOG.warn("Bug: Not printed "+C));
				 */
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        for (VcfInput vi : vcf_readers) try {
            vi.close();
        } catch (IOException err) {
        }
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) Map(java.util.Map) Path(java.nio.file.Path) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) GenotypeType(htsjdk.variant.variantcontext.GenotypeType) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Parameter(com.beust.jcommander.Parameter) HashMap(java.util.HashMap) Function(java.util.function.Function) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) FractionConverter(com.github.lindenb.jvarkit.jcommander.converter.FractionConverter) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Path(java.nio.file.Path) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)
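
ContigDictComparator itself implements Comparator<String> over contig names, which is how contigComparator is declared above, so it can also order bare contig-name collections. A minimal fragment, assuming dict is in scope, the usual java.util imports, and that every name below is present in the dictionary (behaviour for unknown contigs is not shown here):

final Comparator<String> contigCmp = new ContigDictComparator(dict);
final List<String> contigs = new ArrayList<>(Arrays.asList("chrM", "chr10", "chr2", "chr1"));
// sorts in dictionary order (typically chr1, chr2, chr10, chrM for a human reference),
// not in lexicographic order (chr1, chr10, chr2, chrM)
contigs.sort(contigCmp);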

Example 4 with ContigDictComparator

Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.

The class MergeCnvNator, method doWork:

@Override
public int doWork(final List<String> args) {
    if (this.region_are_same_ratio <= 0 || this.region_are_same_ratio > 1) {
        LOG.error("bad region_are_same_ratio :" + this.region_are_same_ratio);
        return -1;
    }
    if (this.treshold < 0 || this.treshold >= 0.5) {
        LOG.error("Bad treshold 0< " + this.treshold + " < 0.5");
        return -1;
    }
    try {
        final List<Path> inputs = IOUtils.unrollPaths(args);
        if (inputs.isEmpty()) {
            LOG.error("input is empty");
            return -1;
        }
        final SAMSequenceDictionary dict;
        final ContigNameConverter contigNameConverter;
        if (this.dictRefFile != null) {
            dict = SAMSequenceDictionaryExtractor.extractDictionary(this.dictRefFile);
            contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
        } else {
            contigNameConverter = null;
            dict = null;
        }
        final IntervalTreeMap<Boolean> limitBedIntervalTreeMap;
        if (this.includeBed != null) {
            limitBedIntervalTreeMap = new IntervalTreeMap<>();
            final BedLineCodec codec = new BedLineCodec();
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.includeBed)) {
                br.lines().filter(S -> !StringUtils.isBlank(S) && !BedLine.isBedHeader(S)).map(L -> codec.decode(L)).filter(B -> B != null).forEach(B -> {
                    final String ctg = contigNameConverter == null ? B.getContig() : contigNameConverter.apply(B.getContig());
                    if (StringUtils.isBlank(ctg))
                        return;
                    limitBedIntervalTreeMap.put(new Interval(ctg, B.getStart(), B.getEnd()), Boolean.TRUE);
                });
            }
        } else {
            limitBedIntervalTreeMap = null;
        }
        final Set<CNVNatorInterval> intervals_set = new HashSet<>();
        final IntervalTreeMap<List<CnvNatorCall>> all_calls = new IntervalTreeMap<>();
        final Set<String> all_samples = new TreeSet<>();
        for (final Path input : inputs) {
            final String fileSample;
            LOG.info("Reading " + input);
            if (this.inputType.equals(InputType.cnvnator)) {
                fileSample = IOUtils.getFilenameWithoutCommonSuffixes(input);
                all_samples.add(fileSample);
            } else {
                fileSample = null;
            }
            final BufferedReader br = IOUtils.openPathForBufferedReading(input);
            String line;
            while ((line = br.readLine()) != null) {
                if (StringUtil.isBlank(line) || line.startsWith("#"))
                    continue;
                final String[] tokens = CharSplitter.TAB.split(line);
                final CnvNatorCall call;
                if (this.inputType.equals(InputType.cnvnator)) {
                    call = new CnvNatorCall(fileSample, Arrays.asList(tokens));
                } else {
                    call = new CnvNatorCall(null, Arrays.asList(tokens));
                    all_samples.add(call.sample);
                }
                if (limitBedIntervalTreeMap != null && !limitBedIntervalTreeMap.containsOverlapping(call)) {
                    continue;
                }
                if (call.size > this.max_cnv_size) {
                    continue;
                }
                if (dict != null && dict.getSequence(call.getContig()) == null) {
                    LOG.warn("skipping " + line + " because contig " + call.getContig() + " is not defined in dictionary");
                    continue;
                }
                intervals_set.add(call.interval);
                final Interval key = new Interval(call);
                List<CnvNatorCall> callList = all_calls.get(key);
                if (callList == null) {
                    callList = new ArrayList<>();
                    all_calls.put(key, callList);
                }
                callList.add(call);
            }
            br.close();
        }
        // contig comparator
        final Comparator<String> contigComparator;
        if (dict != null) {
            contigComparator = new ContigDictComparator(dict);
        } else {
            final SmartComparator smartComparator = new SmartComparator();
            contigComparator = (A, B) -> smartComparator.compare(A, B);
        }
        // call comparator
        final Comparator<CNVNatorInterval> comparator = (A, B) -> {
            int i = contigComparator.compare(A.getContig(), B.getContig());
            if (i != 0)
                return i;
            i = Integer.compare(A.getStart(), B.getStart());
            if (i != 0)
                return i;
            i = Integer.compare(A.getEnd(), B.getEnd());
            if (i != 0)
                return i;
            if (one_cnv_type) {
                i = A.type.compareTo(B.type);
            }
            return i;
        };
        final List<CNVNatorInterval> intervals_list = intervals_set.stream().sorted(comparator).collect(Collectors.toList());
        LOG.info("Identified " + intervals_list.size() + " intervals");
        final Set<VCFHeaderLine> metadata = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY);
        metadata.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Difference in length between REF and ALT alleles"));
        metadata.add(new VCFInfoHeaderLine("IMPRECISE", 0, VCFHeaderLineType.Flag, "Imprecise structural variation"));
        metadata.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Structural variation type"));
        metadata.add(new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples carrying the CNV"));
        metadata.add(new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples carrying the CNV"));
        metadata.add(new VCFInfoHeaderLine("OV", 1, VCFHeaderLineType.Integer, "Number calls overlapping this genotype region, whatever their size ( with at most fraction :" + min_overlapping_ratio + ")."));
        metadata.add(new VCFFormatHeaderLine("WARN", 1, VCFHeaderLineType.String, "WARNINGS"));
        metadata.add(new VCFFormatHeaderLine("RD", 1, VCFHeaderLineType.Float, "Normalized RD"));
        metadata.add(new VCFFormatHeaderLine("P1", 1, VCFHeaderLineType.Float, "e-val by t-test"));
        metadata.add(new VCFFormatHeaderLine("P2", 1, VCFHeaderLineType.Float, "e-val by Gaussian tail"));
        metadata.add(new VCFFormatHeaderLine("P3", 1, VCFHeaderLineType.Float, "e-val by t-test (middle)"));
        metadata.add(new VCFFormatHeaderLine("P4", 1, VCFHeaderLineType.Float, "e-val by Gaussian tail (middle)"));
        metadata.add(new VCFFormatHeaderLine("Q0", 1, VCFHeaderLineType.Float, "Fraction of reads with 0 mapping quality"));
        metadata.add(new VCFFormatHeaderLine("CN", 1, VCFHeaderLineType.Integer, "Copy number genotype for imprecise events"));
        metadata.add(new VCFFormatHeaderLine("PE", 1, VCFHeaderLineType.Integer, "Number of paired-ends that support the event"));
        metadata.add(new VCFFormatHeaderLine("OV", 1, VCFHeaderLineType.Integer, "Number calls (with different sample) overlapping this genotype"));
        final VCFHeader header = new VCFHeader(metadata, all_samples);
        JVarkitVersion.getInstance().addMetaData(this, header);
        if (dict != null)
            header.setSequenceDictionary(dict);
        try (VariantContextWriter out = this.writingVariants.dictionary(dict).open(this.outputFile)) {
            long id_generator = 0L;
            out.writeHeader(header);
            final ProgressFactory.Watcher<CNVNatorInterval> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
            String prevContig = null;
            for (final CNVNatorInterval interval : intervals_list) {
                final Set<String> warnings = new HashSet<>();
                progress.apply(interval);
                if (!interval.getContig().equals(prevContig)) {
                    // cleanup memory, increase speed ?
                    final String finalPrevCtg = prevContig;
                    all_calls.keySet().removeIf(R -> R.getContig().equals(finalPrevCtg));
                    prevContig = interval.getContig();
                }
                // find all call overlapping this call
                final List<CnvNatorCall> overlappingList = all_calls.getOverlapping(interval).stream().flatMap(L -> L.stream()).filter(C -> !one_cnv_type || C.interval.type.equals(interval.type)).filter(C -> !uniq_use_of_one_cnv || !C.echoed_flag).collect(Collectors.toList());
                if (overlappingList.isEmpty())
                    continue;
                // main call for this interval
                final CnvNatorCall baseCall = overlappingList.stream().filter(C -> C.interval.equals(interval)).findFirst().orElse(null);
                if (baseCall == null) {
                    continue;
                }
                // filter calls matching the overlapping with baseCall
                final List<CnvNatorCall> callsToPrint = overlappingList.stream().filter(C -> testOverlapping(C, baseCall)).collect(Collectors.toList());
                if (callsToPrint.isEmpty()) {
                    throw new IllegalStateException();
                }
                final VariantContextBuilder vcb = new VariantContextBuilder();
                final Set<Allele> altAlleles = new HashSet<>();
                vcb.chr(baseCall.getContig());
                vcb.start(baseCall.getStart());
                vcb.stop(baseCall.getEnd());
                vcb.attribute(VCFConstants.END_KEY, baseCall.getEnd());
                vcb.attribute("SVLEN", baseCall.size);
                vcb.attribute("IMPRECISE", true);
                final Map<String, Genotype> sample2gt = new HashMap<>(callsToPrint.size());
                for (final CnvNatorCall call : callsToPrint) {
                    if (sample2gt.containsKey(call.sample)) {
                        warnings.add("SAMPLE_" + call.sample + "_MULTIPLE");
                        LOG.warn("Sample " + call.sample + " exits twice at the same loc " + call + " could be two small SV overlapping a big one.");
                        continue;
                    }
                    if (uniq_use_of_one_cnv)
                        call.echoed_flag = true;
                    final GenotypeBuilder gb = new GenotypeBuilder(call.sample);
                    if (call.normalized_RD != null)
                        gb.attribute("RD", call.normalized_RD);
                    if (call.e_val_1 != null)
                        gb.attribute("P1", call.e_val_1);
                    if (call.e_val_2 != null)
                        gb.attribute("P2", call.e_val_2);
                    if (call.e_val_3 != null)
                        gb.attribute("P3", call.e_val_3);
                    if (call.e_val_4 != null)
                        gb.attribute("P4", call.e_val_4);
                    if (call.q0 != null)
                        gb.attribute("Q0", call.q0);
                    gb.attribute("OV", all_calls.getOverlapping(call).stream().flatMap(col -> col.stream()).filter(C -> !C.sample.equals(call.sample)).count());
                    if (call.interval.type == CnvType.deletion && call.normalized_RD != null && call.normalized_RD < this.treshold) {
                        gb.alleles(Arrays.asList(DEL_ALLELE, DEL_ALLELE));
                        altAlleles.add(DEL_ALLELE);
                        gb.attribute("CN", 0);
                    } else if (call.interval.type == CnvType.deletion && call.normalized_RD != null && call.normalized_RD >= this.treshold) {
                        gb.alleles(Arrays.asList(REF_ALLELE, DEL_ALLELE));
                        altAlleles.add(DEL_ALLELE);
                        gb.attribute("CN", 1);
                    } else if (call.interval.type == CnvType.duplication && call.normalized_RD != null && call.normalized_RD <= (1.5 + this.treshold)) {
                        gb.alleles(Arrays.asList(REF_ALLELE, DUP_ALLELE));
                        altAlleles.add(DUP_ALLELE);
                        gb.attribute("CN", 2);
                    } else if (call.interval.type == CnvType.duplication && call.normalized_RD != null && call.normalized_RD > (1.5 + this.treshold)) {
                        gb.alleles(Arrays.asList(DUP_ALLELE, DUP_ALLELE));
                        gb.attribute("CN", 9999);
                        altAlleles.add(DUP_ALLELE);
                    } else if (hom_ref_instead_of_nocall) {
                        gb.alleles(Arrays.asList(REF_ALLELE, REF_ALLELE));
                    } else {
                        gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
                    }
                    sample2gt.put(call.sample, gb.make());
                }
                if (altAlleles.isEmpty()) {
                    vcb.attribute(VCFConstants.SVTYPE, "UNDEFINED");
                }
                if (altAlleles.size() == 1 && altAlleles.contains(DEL_ALLELE)) {
                    vcb.attribute(VCFConstants.SVTYPE, "DEL");
                } else if (altAlleles.size() == 1 && altAlleles.contains(DUP_ALLELE)) {
                    vcb.attribute(VCFConstants.SVTYPE, "DUP");
                } else {
                    vcb.attribute(VCFConstants.SVTYPE, "MIXED");
                }
                final List<Allele> alleles = new ArrayList<>();
                alleles.add(REF_ALLELE);
                alleles.addAll(altAlleles);
                vcb.id(String.format("cnv%05d", (++id_generator)));
                vcb.alleles(alleles);
                vcb.attribute("SAMPLES", new ArrayList<>(sample2gt.keySet()));
                vcb.attribute("NSAMPLES", sample2gt.size());
                if (!warnings.isEmpty()) {
                    vcb.attribute("WARN", new ArrayList<>(warnings));
                }
                vcb.attribute("OV", all_calls.getOverlapping(baseCall).stream().flatMap(L -> L.stream()).filter(L -> {
                    final Interval r1 = new Interval(L);
                    final Interval r2 = new Interval(baseCall);
                    if (!r1.overlaps(r2))
                        return false;
                    final double L1 = r1.getIntersectionLength(r2);
                    if ((L1 / r1.length()) < min_overlapping_ratio)
                        return false;
                    return true;
                }).count());
                vcb.genotypes(sample2gt.values());
                out.add(vcb.make());
            }
            progress.close();
        }
        if (uniq_use_of_one_cnv) {
            all_calls.values().stream().flatMap(C -> C.stream()).filter(C -> !C.echoed_flag).forEach(C -> LOG.warn("Bug: Not printed " + C));
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) Path(java.nio.file.Path) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) List(java.util.List) SmartComparator(com.github.lindenb.jvarkit.lang.SmartComparator) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) HashMap(java.util.HashMap) Function(java.util.function.Function) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) SAMSequenceDictionaryExtractor(htsjdk.variant.utils.SAMSequenceDictionaryExtractor) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) FractionConverter(com.github.lindenb.jvarkit.jcommander.converter.FractionConverter) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TreeSet(java.util.TreeSet) List(java.util.List) ArrayList(java.util.ArrayList) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SmartComparator(com.github.lindenb.jvarkit.lang.SmartComparator) Interval(htsjdk.samtools.util.Interval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) 
VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Path(java.nio.file.Path) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap)
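
When no reference dictionary is given, this example falls back to SmartComparator, jvarkit's natural-order string comparator, so names such as chr2 still sort before chr10. A small helper capturing that selection logic is sketched below; the method name is hypothetical and the required imports (Comparator, SAMSequenceDictionary, ContigDictComparator, SmartComparator) are assumed.

// returns a contig-name comparator: dictionary order when a dictionary is available,
// natural ("smart") ordering of the names otherwise
static Comparator<String> contigComparatorOf(final SAMSequenceDictionary dictOrNull) {
    if (dictOrNull != null) {
        return new ContigDictComparator(dictOrNull);
    }
    final SmartComparator smart = new SmartComparator();
    return (a, b) -> smart.compare(a, b);
}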

Example 5 with ContigDictComparator

Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.

The class BamXtremDepth, method doWork:

@Override
public int doWork(final List<String> args) {
    if (this.min_coverage >= this.max_coverage) {
        LOG.error("min coverage >= max_coverage");
        return -1;
    }
    try {
        final SamReaderFactory srf = SamReaderFactory.make().validationStringency(this.validationStringency).referenceSequence(this.faidxPath);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(faidxPath);
        final List<Path> bamPaths = IOUtils.unrollPaths(args);
        if (bamPaths.isEmpty()) {
            LOG.error("No bam was defined");
            return -1;
        }
        try (PrintWriter out = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
            final boolean out_is_intervallist = (this.output_format_interval_list || (this.outputFile != null && (this.outputFile.toString().endsWith(FileExtensions.COMPRESSED_INTERVAL_LIST) || this.outputFile.toString().endsWith(FileExtensions.INTERVAL_LIST))));
            if (out_is_intervallist) {
                final SAMFileHeader outHeader = new SAMFileHeader(dict);
                JVarkitVersion.getInstance().addMetaData(this, outHeader);
                final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
                codec.encode(out, outHeader);
            }
            for (final SAMSequenceRecord ssr : dict.getSequences()) {
                if (!this.limit_to_chromosomes.isEmpty() && !this.limit_to_chromosomes.contains(ssr.getSequenceName()))
                    continue;
                LOG.info("Scanning " + ssr.getSequenceName());
                final BitSet highList = new BitSet(ssr.getLengthOnReference());
                highList.set(0, ssr.getSequenceLength());
                final BitSet lowList = new BitSet(ssr.getLengthOnReference());
                lowList.set(0, ssr.getSequenceLength());
                runLoop(bamPaths, srf, ssr, dict, lowList, highList);
                final List<Interval> rgn = new ArrayList<>(highList.size() + lowList.size());
                bitSetToLocatables(ssr, lowList).stream().map(R -> new Interval(R.getContig(), R.getStart(), R.getEnd(), false, "LE." + this.min_coverage)).forEach(R -> rgn.add(R));
                bitSetToLocatables(ssr, highList).stream().map(R -> new Interval(R.getContig(), R.getStart(), R.getEnd(), false, "GE." + this.max_coverage)).forEach(R -> rgn.add(R));
                rgn.sort(new ContigDictComparator(dict).createLocatableComparator());
                for (final Interval R : rgn) {
                    out.print(R.getContig());
                    out.print("\t");
                    out.print(R.getStart() - (out_is_intervallist ? 0 : 1));
                    out.print("\t");
                    out.print(R.getEnd());
                    out.print("\t");
                    if (out_is_intervallist)
                        out.print("+\t");
                    out.print(R.getName());
                    out.println();
                }
            }
            out.flush();
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : Path(java.nio.file.Path) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) IntPredicate(java.util.function.IntPredicate) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ValidationStringency(htsjdk.samtools.ValidationStringency) SAMTextHeaderCodec(htsjdk.samtools.SAMTextHeaderCodec) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) CoverageFactory(com.github.lindenb.jvarkit.samtools.CoverageFactory) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) List(java.util.List) FileExtensions(htsjdk.samtools.util.FileExtensions) BitSet(java.util.BitSet) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SamReaderFactory(htsjdk.samtools.SamReaderFactory) BitSet(java.util.BitSet) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) SAMTextHeaderCodec(htsjdk.samtools.SAMTextHeaderCodec) SAMFileHeader(htsjdk.samtools.SAMFileHeader) PrintWriter(java.io.PrintWriter) Interval(htsjdk.samtools.util.Interval) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval)
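
The rows written above differ only in coordinate convention: Picard interval_list lines are 1-based and carry a strand column, while BED lines are 0-based half-open, hence the getStart() - 1. A tiny fragment showing that same formatting decision, assuming an htsjdk Interval r and a boolean outIsIntervalList (both names are illustrative):

// interval_list: contig, 1-based start, end, strand, name; BED: contig, 0-based start, end, name
final String row = outIsIntervalList
        ? String.join("\t", r.getContig(), String.valueOf(r.getStart()), String.valueOf(r.getEnd()), "+", r.getName())
        : String.join("\t", r.getContig(), String.valueOf(r.getStart() - 1), String.valueOf(r.getEnd()), r.getName());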

Aggregations

ContigDictComparator (com.github.lindenb.jvarkit.util.samtools.ContigDictComparator)28 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)26 Parameter (com.beust.jcommander.Parameter)25 Program (com.github.lindenb.jvarkit.util.jcommander.Program)25 Logger (com.github.lindenb.jvarkit.util.log.Logger)25 Path (java.nio.file.Path)25 List (java.util.List)25 Collectors (java.util.stream.Collectors)23 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)22 ArrayList (java.util.ArrayList)22 Set (java.util.Set)20 HashSet (java.util.HashSet)19 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)18 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)18 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)17 VCFHeader (htsjdk.variant.vcf.VCFHeader)17 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)16 CloserUtil (htsjdk.samtools.util.CloserUtil)16 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)16 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)16