Search in sources :

Example 1 with Intron

use of com.github.lindenb.jvarkit.util.bio.structure.Intron in project jvarkit by lindenb.

the class VcfGtfSplitter method testTranscript.

private boolean testTranscript(final Transcript transcript, final VariantContext ctx) {
    if (!transcript.overlaps(ctx)) {
        if (this.use_upstream) {
            final SimplePosition pos = new SimplePosition(transcript.getContig(), transcript.isPositiveStrand() ? transcript.getStart() : transcript.getEnd());
            if (ctx.withinDistanceOf(pos, this.xxxxstream_length))
                return true;
        }
        if (this.use_downstream) {
            final SimplePosition pos = new SimplePosition(transcript.getContig(), transcript.isPositiveStrand() ? transcript.getEnd() : transcript.getStart());
            if (ctx.withinDistanceOf(pos, this.xxxxstream_length))
                return true;
        }
        return false;
    }
    if (this.use_exon && transcript.hasExon() && transcript.getExons().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
        return true;
    if (this.use_intron && transcript.hasIntron() && transcript.getIntrons().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
        return true;
    if (this.use_cds && transcript.hasCDS() && transcript.getAllCds().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
        return true;
    if (this.use_utr5) {
        if (transcript.isPositiveStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().overlaps(ctx))
            return true;
        if (transcript.isNegativeStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().overlaps(ctx))
            return true;
    }
    if (this.use_utr3) {
        if (transcript.isPositiveStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().overlaps(ctx))
            return true;
        if (transcript.isNegativeStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().overlaps(ctx))
            return true;
    }
    if (this.use_cds_utr5) {
        if (transcript.isPositiveStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
            return true;
        if (transcript.isNegativeStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
            return true;
    }
    if (this.use_cds_utr3) {
        if (transcript.isPositiveStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
            return true;
        if (transcript.isNegativeStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
            return true;
    }
    if (this.use_stop && transcript.hasCodonStopDefined() && transcript.getCodonStop().get().getBlocks().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
        return true;
    if (this.use_start && transcript.hasCodonStartDefined() && transcript.getCodonStart().get().getBlocks().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
        return true;
    if (this.use_splice && transcript.hasIntron()) {
        for (final Intron intron : transcript.getIntrons()) {
            final SimpleInterval splice1 = new SimpleInterval(intron.getContig(), intron.getStart() - 1, intron.getStart());
            if (ctx.withinDistanceOf(splice1, this.split_length))
                return true;
            final SimpleInterval splice2 = new SimpleInterval(intron.getContig(), intron.getEnd(), intron.getEnd() + 1);
            if (ctx.withinDistanceOf(splice2, this.split_length))
                return true;
        }
    }
    return false;
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) AttributeCleaner(com.github.lindenb.jvarkit.variant.variantcontext.AttributeCleaner) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) TabixIndexCreator(htsjdk.tribble.index.tabix.TabixIndexCreator) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) VariantContextWriterBuilder(htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) OutputStream(java.io.OutputStream) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Locatable(htsjdk.samtools.util.Locatable) Iterator(java.util.Iterator) Files(java.nio.file.Files) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) SimplePosition(com.github.lindenb.jvarkit.samtools.util.SimplePosition) List(java.util.List) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) FileExtensions(htsjdk.samtools.util.FileExtensions) Options(htsjdk.variant.variantcontext.writer.Options) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) Comparator(java.util.Comparator) TabixFormat(htsjdk.tribble.index.tabix.TabixFormat) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SimplePosition(com.github.lindenb.jvarkit.samtools.util.SimplePosition)

Example 2 with Intron

use of com.github.lindenb.jvarkit.util.bio.structure.Intron in project jvarkit by lindenb.

the class VcfBurdenGtf method runBurden.

@Override
protected void runBurden(PrintWriter pw, VCFReader vcfReader, VariantContextWriter vcw) throws IOException {
    final SAMSequenceDictionary vcfDict = SequenceDictionaryUtils.extractRequired(vcfReader.getHeader());
    final List<Gene> all_genes;
    try (GtfReader gtfReader = new GtfReader(this.gtfFile)) {
        gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(vcfDict));
        all_genes = gtfReader.getAllGenes().stream().filter(G -> StringUtil.isBlank(this.intergenic_contig) || this.intergenic_contig.equals("*") || this.intergenic_contig.equals(G.getContig())).sorted(new ContigDictComparator(vcfDict).createLocatableComparator()).collect(Collectors.toCollection(ArrayList::new));
    }
    pw.print("#chrom");
    pw.print("\t");
    pw.print("start0");
    pw.print("\t");
    pw.print("end");
    pw.print("\t");
    pw.print("name");
    pw.print("\t");
    pw.print("length");
    pw.print("\t");
    pw.print("gene");
    pw.print("\t");
    pw.print("type");
    pw.print("\t");
    pw.print("strand");
    pw.print("\t");
    pw.print("transcript");
    pw.print("\t");
    pw.print("gene-id");
    pw.print("\t");
    pw.print("intervals");
    pw.print("\t");
    pw.print("p-value");
    pw.print("\t");
    pw.print("affected_alt");
    pw.print("\t");
    pw.print("affected_hom");
    pw.print("\t");
    pw.print("unaffected_alt");
    pw.print("\t");
    pw.print("unaffected_hom");
    pw.print("\t");
    pw.print("variants.count");
    pw.println();
    final List<SimpleInterval> all_intergenic = new ArrayList<>();
    if (!StringUtil.isBlank(this.intergenic_contig)) {
        for (final SAMSequenceRecord ssr : vcfDict.getSequences()) {
            if (!(this.intergenic_contig.equals("*") || this.intergenic_contig.equals(ssr.getSequenceName())))
                continue;
            final BitSet filled = new BitSet(ssr.getSequenceLength() + 2);
            all_genes.stream().filter(G -> G.getContig().equals(ssr.getSequenceName())).forEach(G -> filled.set(G.getStart(), 1 + /* bit set is 0 based */
            Math.min(G.getEnd(), ssr.getSequenceLength())));
            int i = 1;
            while (i < ssr.getSequenceLength()) {
                if (filled.get(i)) {
                    i++;
                    continue;
                }
                int j = i;
                while (j < ssr.getSequenceLength() && !filled.get(j)) {
                    j++;
                }
                all_intergenic.add(new SimpleInterval(ssr.getSequenceName(), i, j));
                i = j + 1;
            }
            all_genes.removeIf(G -> G.getContig().equals(ssr.getSequenceName()));
        }
        all_genes.clear();
    }
    final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
    /* run genes */
    for (final Gene gene : all_genes) {
        progress.apply(gene);
        final IntervalTree<VariantContext> intervalTree = new IntervalTree<>();
        vcfReader.query(gene).stream().filter(V -> accept(V)).forEach(V -> intervalTree.put(V.getStart(), V.getEnd(), V));
        if (intervalTree.size() == 0)
            continue;
        for (final Transcript transcript : gene.getTranscripts()) {
            final List<SubPartOfTranscript> parts = new ArrayList<>();
            parts.addAll(transcript.getExons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
            parts.addAll(transcript.getIntrons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
            final int intron_window_size = 1000;
            final int intron_window_shift = 500;
            for (final Intron intron : transcript.getIntrons()) {
                if (intron.getLengthOnReference() <= intron_window_size)
                    continue;
                int start_pos = intron.getStart();
                while (start_pos + intron_window_size <= intron.getEnd()) {
                    int xend = Math.min(intron.getEnd(), start_pos + intron_window_size - 1);
                    int xstart = xend - intron_window_size - 1;
                    parts.add(new SubPartOfTranscript(transcript, intron.getName() + ".Sliding", Collections.singletonList(new SimpleInterval(intron.getContig(), xstart, xend))));
                    start_pos += intron_window_shift;
                }
            }
            for (final UTR utr : transcript.getUTRs()) {
                parts.add(new SubPartOfTranscript(transcript, utr.getName(), utr.getIntervals()));
            }
            if (transcript.getExonCount() > 1) {
                parts.add(new SubPartOfTranscript(transcript, "AllExons", transcript.getExons().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
            }
            if (transcript.hasCodonStartDefined() && transcript.hasCodonStopDefined() && transcript.getAllCds().size() > 1) {
                parts.add(new SubPartOfTranscript(transcript, "AllCds", transcript.getAllCds().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
            }
            final int L = transcript.getTranscriptLength();
            final int[] index2genomic = new int[L];
            int pos = 0;
            for (final Exon exon : transcript.getExons()) {
                for (int i = exon.getStart(); i <= exon.getEnd(); i++) {
                    index2genomic[pos] = i;
                    pos++;
                }
            }
            final int window_size = 200;
            final int window_shift = 100;
            int array_index = 0;
            while (array_index < index2genomic.length) {
                final List<Locatable> intervals = new ArrayList<>();
                int prev_pos = -1;
                int start_pos = index2genomic[array_index];
                int i = 0;
                while (i < window_size && array_index + i < index2genomic.length) {
                    final int curr_pos = index2genomic[array_index + i];
                    if (i > 0 && prev_pos + 1 != curr_pos) {
                        intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
                        start_pos = curr_pos;
                    }
                    prev_pos = curr_pos;
                    i++;
                }
                intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
                parts.add(new SubPartOfTranscript(transcript, "Sliding", intervals));
                array_index += window_shift;
            }
            for (final SubPartOfTranscript part : parts) {
                final List<VariantContext> variants = new ArrayList<>();
                for (final Locatable loc : part.intervals) {
                    Iterator<IntervalTree.Node<VariantContext>> iter = intervalTree.overlappers(loc.getStart(), loc.getEnd());
                    while (iter.hasNext()) variants.add(iter.next().getValue());
                }
                if (variants.isEmpty())
                    continue;
                final FisherResult fisher = runFisher(variants);
                if (fisher.p_value > this.fisherTreshold)
                    continue;
                if (vcw != null) {
                    for (final VariantContext ctx : variants) {
                        vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(part.label)).make());
                    }
                }
                pw.print(part.getContig());
                pw.print("\t");
                pw.print(part.getStart() - 1);
                pw.print("\t");
                pw.print(part.getEnd());
                pw.print("\t");
                pw.print(part.label);
                pw.print("\t");
                pw.print(part.getLengthOnReference());
                pw.print("\t");
                pw.print(transcript.getProperties().getOrDefault("gene_name", "."));
                pw.print("\t");
                pw.print(transcript.getProperties().getOrDefault("transcript_type", "."));
                pw.print("\t");
                pw.print(gene.getStrand());
                pw.print("\t");
                pw.print(transcript.getId());
                pw.print("\t");
                pw.print(gene.getId());
                pw.print("\t");
                pw.print(part.intervals.stream().map(R -> String.valueOf(R.getStart()) + "-" + R.getEnd()).collect(Collectors.joining(";")));
                pw.print("\t");
                pw.print(fisher.p_value);
                pw.print("\t");
                pw.print(fisher.affected_alt);
                pw.print("\t");
                pw.print(fisher.affected_hom);
                pw.print("\t");
                pw.print(fisher.unaffected_alt);
                pw.print("\t");
                pw.print(fisher.unaffected_hom);
                pw.print("\t");
                pw.print(variants.size());
                pw.println();
            }
        }
    }
    progress.close();
    final ProgressFactory.Watcher<SimpleInterval> progress2 = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
    /**
     * scan intergenics ...
     */
    for (final SimpleInterval intergenic : all_intergenic) {
        progress2.apply(intergenic);
        final int intergenic_window_size = 2000;
        final int intergenic_window_shifr = 100;
        final List<SimpleInterval> parts = new ArrayList<>();
        if (intergenic.getLengthOnReference() <= intergenic_window_size)
            continue;
        int start_pos = intergenic.getStart();
        while (start_pos + intergenic_window_size <= intergenic.getEnd()) {
            int xend = Math.min(intergenic.getEnd(), start_pos + intergenic_window_size - 1);
            int xstart = xend - intergenic_window_size - 1;
            parts.add(new SimpleInterval(intergenic.getContig(), xstart, xend));
            start_pos += intergenic_window_shifr;
        }
        for (final SimpleInterval part : parts) {
            final List<VariantContext> variants = vcfReader.query(part).stream().filter(V -> accept(V)).collect(Collectors.toList());
            if (variants.isEmpty())
                continue;
            final FisherResult fisher = runFisher(variants);
            if (fisher.p_value > this.fisherTreshold)
                continue;
            final String label = "intergenic_" + part.getStart() + "_" + part.getEnd();
            if (vcw != null) {
                for (final VariantContext ctx : variants) {
                    vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(label)).make());
                }
            }
            pw.print(part.getContig());
            pw.print("\t");
            pw.print(part.getStart() - 1);
            pw.print("\t");
            pw.print(part.getEnd());
            pw.print("\t");
            pw.print(label);
            pw.print("\t");
            pw.print(part.getLengthOnReference());
            pw.print("\t");
            pw.print(".");
            pw.print("\t");
            pw.print("intergenic");
            pw.print("\t");
            pw.print(".");
            pw.print("\t");
            pw.print(".");
            pw.print("\t");
            pw.print(".");
            pw.print("\t");
            pw.print("" + part.getStart() + "-" + part.getEnd());
            pw.print("\t");
            pw.print(fisher.p_value);
            pw.print("\t");
            pw.print(fisher.affected_alt);
            pw.print("\t");
            pw.print(fisher.affected_hom);
            pw.print("\t");
            pw.print(fisher.unaffected_alt);
            pw.print("\t");
            pw.print(fisher.unaffected_hom);
            pw.print("\t");
            pw.print(variants.size());
            pw.println();
        }
    }
    progress2.close();
}
Also used : VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) JexlVariantPredicate(com.github.lindenb.jvarkit.util.vcf.JexlVariantPredicate) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) ArrayList(java.util.ArrayList) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) StringUtil(htsjdk.samtools.util.StringUtil) Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) Iterator(java.util.Iterator) Predicate(java.util.function.Predicate) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VCFReader(htsjdk.variant.vcf.VCFReader) IOException(java.io.IOException) IntervalTree(htsjdk.samtools.util.IntervalTree) Collectors(java.util.stream.Collectors) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) List(java.util.List) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) BitSet(java.util.BitSet) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) TranscriptInterval(com.github.lindenb.jvarkit.util.bio.structure.TranscriptInterval) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) IntervalTree(htsjdk.samtools.util.IntervalTree) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) BitSet(java.util.BitSet) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Locatable(htsjdk.samtools.util.Locatable)

Example 3 with Intron

use of com.github.lindenb.jvarkit.util.bio.structure.Intron in project jvarkit by lindenb.

the class VcfGtfSplitter method doWork.

@Override
public int doWork(final List<String> args) {
    ArchiveFactory archiveFactory = null;
    PrintWriter manifest = null;
    VCFReader vcfFileReader = null;
    try {
        this.attCleaner = AttributeCleaner.compile(this.xannotatePattern);
        for (final String s : featuresString.split("[;, ]")) {
            if (StringUtils.isBlank(s))
                continue;
            if (s.equals("cds")) {
                use_cds = true;
            } else if (s.equals("intron")) {
                use_cds = true;
            } else if (s.equals("exon")) {
                use_exon = true;
            } else if (s.equals("stop")) {
                use_stop = true;
            } else if (s.equals("start")) {
                use_start = true;
            } else if (s.equals("transcript")) {
                use_exon = true;
                use_intron = true;
            } else if (s.equals("utr5")) {
                use_utr5 = true;
            } else if (s.equals("utr3")) {
                use_utr3 = true;
            } else if (s.equals("utr")) {
                use_utr3 = true;
                use_utr5 = true;
            } else if (s.equals("upstream")) {
                use_upstream = true;
            } else if (s.equals("downstream")) {
                use_downstream = true;
            } else if (s.equals("splice")) {
                use_splice = true;
            } else if (s.equals("cds_utr5")) {
                use_cds_utr5 = true;
            } else if (s.equals("cds_utr3")) {
                use_cds_utr3 = true;
            } else if (s.equals("cds_utr")) {
                use_cds_utr3 = true;
                use_cds_utr5 = true;
            } else {
                LOG.error("unknown code " + s + " in " + this.featuresString);
                return -1;
            }
        }
        final Path tmpVcf = Files.createTempFile("tmp.", (use_bcf ? FileExtensions.BCF : FileExtensions.COMPRESSED_VCF));
        String input = oneAndOnlyOneFile(args);
        vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(input), true);
        final VCFHeader header1 = vcfFileReader.getHeader();
        final SAMSequenceDictionary dict = header1.getSequenceDictionary();
        if (dict == null && this.use_bcf) {
            throw new JvarkitException.VcfDictionaryMissing(input);
        }
        if (dict != null && !limitToContigs.isEmpty()) {
            final ContigNameConverter ctgNameConverter = ContigNameConverter.fromOneDictionary(dict);
            final Set<String> set2 = new HashSet<>(this.limitToContigs.size());
            for (final String ctg : this.limitToContigs) {
                final String ctg2 = ctgNameConverter.apply(ctg);
                if (StringUtils.isBlank(ctg2)) {
                    LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(ctg, dict));
                    return -1;
                }
                set2.add(ctg2);
            }
            this.limitToContigs = set2;
        }
        final List<Gene> all_genes;
        try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
            final Comparator<Gene> cmp;
            if (dict != null) {
                gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
                cmp = new ContigDictComparator(dict).createLocatableComparator();
            } else {
                cmp = (A, B) -> {
                    final int i = A.getContig().compareTo(B.getContig());
                    if (i != 0)
                        return i;
                    return Integer.compare(A.getStart(), B.getStart());
                };
            }
            all_genes = gtfReader.getAllGenes().stream().filter(G -> {
                if (this.protein_coding_only && !"protein_coding".equals(G.getGeneBiotype()))
                    return false;
                if (this.limitToContigs.isEmpty())
                    return true;
                return this.limitToContigs.contains(G.getContig());
            }).sorted(cmp).collect(Collectors.toList());
        }
        archiveFactory = ArchiveFactory.open(this.outputFile);
        archiveFactory.setCompressionLevel(0);
        manifest = new PrintWriter(this.manifestFile == null ? new NullOuputStream() : IOUtils.openPathForWriting(manifestFile));
        manifest.println("#chrom\tstart\tend\tGene-Id\tGene-Name\tGene-Biotype\tTranscript-Id\tpath\tCount_Variants");
        if (this.split_by_transcript) {
            final Iterator<Transcript> triter = all_genes.stream().flatMap(G -> G.getTranscripts().stream()).iterator();
            while (triter.hasNext()) {
                final Transcript tr = triter.next();
                final AbstractSplitter splitter = new TranscriptSplitter(tr);
                this.split(splitter, vcfFileReader, header1, dict, archiveFactory, tmpVcf, manifest);
            }
        } else {
            for (Gene gene : all_genes) {
                final AbstractSplitter splitter = new GeneSplitter(gene);
                this.split(splitter, vcfFileReader, header1, dict, archiveFactory, tmpVcf, manifest);
            }
        }
        vcfFileReader.close();
        vcfFileReader = null;
        manifest.flush();
        manifest.close();
        manifest = null;
        archiveFactory.close();
        Files.deleteIfExists(tmpVcf);
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(vcfFileReader);
        CloserUtil.close(archiveFactory);
        CloserUtil.close(manifest);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) AttributeCleaner(com.github.lindenb.jvarkit.variant.variantcontext.AttributeCleaner) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) TabixIndexCreator(htsjdk.tribble.index.tabix.TabixIndexCreator) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) VariantContextWriterBuilder(htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) OutputStream(java.io.OutputStream) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Locatable(htsjdk.samtools.util.Locatable) Iterator(java.util.Iterator) Files(java.nio.file.Files) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) SimplePosition(com.github.lindenb.jvarkit.samtools.util.SimplePosition) List(java.util.List) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) FileExtensions(htsjdk.samtools.util.FileExtensions) Options(htsjdk.variant.variantcontext.writer.Options) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) Comparator(java.util.Comparator) TabixFormat(htsjdk.tribble.index.tabix.TabixFormat) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) VCFReader(htsjdk.variant.vcf.VCFReader) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) PrintWriter(java.io.PrintWriter) HashSet(java.util.HashSet) Path(java.nio.file.Path) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) IOException(java.io.IOException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader)

Example 4 with Intron

use of com.github.lindenb.jvarkit.util.bio.structure.Intron in project jvarkit by lindenb.

the class GtfRetroCopy method doWork.

@Override
public int doWork(final List<String> args) {
    VCFReader vcfFileReader = null;
    VariantContextWriter vcw0 = null;
    try {
        /* load the reference genome */
        /* create a contig name converter from the REF */
        final Set<String> knownGeneIds;
        if (this.knownPath != null) {
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.knownPath)) {
                knownGeneIds = br.lines().filter(L -> !StringUtils.isBlank(L)).map(S -> S.trim()).filter(S -> !(S.equals("-") || S.equals(".") || S.startsWith("#"))).collect(Collectors.toSet());
            }
        } else {
            knownGeneIds = Collections.emptySet();
        }
        // open the sam file
        final String input = oneAndOnlyOneFile(args);
        vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(input), true);
        final VCFHeader header = vcfFileReader.getHeader();
        final SAMSequenceDictionary dict = header.getSequenceDictionary();
        final Comparator<String> contigCmp = dict == null ? (A, B) -> A.compareTo(B) : new ContigDictComparator(dict);
        final Comparator<Gene> geneCmp = (A, B) -> {
            int i = contigCmp.compare(A.getContig(), B.getContig());
            if (i != 0)
                return i;
            i = Integer.compare(A.getStart(), B.getStart());
            if (i != 0)
                return i;
            return Integer.compare(A.getEnd(), B.getEnd());
        };
        final GtfReader gtfReader = new GtfReader(this.gtfPath);
        if (dict != null && !dict.isEmpty()) {
            this.writingVcf.dictionary(dict);
            gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
        }
        final List<Gene> genes = gtfReader.getAllGenes().stream().filter(G -> G.getTranscripts().stream().count() > 0L).filter(G -> G.getTranscripts().stream().anyMatch(T -> T.getIntronCount() >= this.min_intron_count)).sorted(geneCmp).collect(Collectors.toList());
        gtfReader.close();
        /**
         * build vcf header
         */
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
        metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
        metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_BOUNDS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Introns boundaries"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_SIZES, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Introns sizes"));
        metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
        for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
            metaData.add(new VCFInfoHeaderLine(att, 1, VCFHeaderLineType.String, "Value for the attribute '" + att + "' in the gtf"));
        }
        // metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
        // metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
        // "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
        metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
        metaData.add(new VCFFilterHeaderLine(ATT_KNOWN, "Known RetroGenes. " + (this.knownPath == null ? "" : " Source: " + this.knownPath)));
        final VCFHeader header2 = new VCFHeader(header);
        metaData.stream().forEach(H -> header2.addMetaDataLine(H));
        JVarkitVersion.getInstance().addMetaData(this, header2);
        final Allele ref = Allele.create((byte) 'N', true);
        final Allele alt = Allele.create("<RETROCOPY>", false);
        /* open vcf for writing*/
        vcw0 = this.writingVcf.open(this.outputFile);
        vcw0.writeHeader(header2);
        final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(dict).build();
        for (final Gene gene : genes) {
            progress.apply(gene);
            final List<VariantContext> variants = new ArrayList<>();
            final CloseableIterator<VariantContext> iter2 = vcfFileReader.query(gene.getContig(), gene.getStart(), gene.getEnd());
            while (iter2.hasNext()) {
                final VariantContext ctx = iter2.next();
                // SNV
                if (ctx.getStart() == ctx.getEnd())
                    continue;
                StructuralVariantType svType = ctx.getStructuralVariantType();
                if (StructuralVariantType.BND.equals(svType))
                    continue;
                if (StructuralVariantType.INS.equals(svType))
                    continue;
                variants.add(ctx);
            }
            iter2.close();
            if (variants.isEmpty())
                continue;
            for (final Transcript transcript : gene.getTranscripts()) {
                if (!transcript.hasIntron())
                    continue;
                if (transcript.getIntronCount() < this.min_intron_count)
                    continue;
                final Counter<String> samples = new Counter<>();
                for (final Intron intron : transcript.getIntrons()) {
                    for (final VariantContext ctx : variants) {
                        if (!isWithinDistance(intron.getStart(), ctx.getStart()))
                            continue;
                        if (!isWithinDistance(intron.getEnd(), ctx.getEnd()))
                            continue;
                        if (ctx.hasGenotypes()) {
                            for (final Genotype g : ctx.getGenotypes()) {
                                if (g.isNoCall() || g.isHomRef())
                                    continue;
                                samples.incr(g.getSampleName());
                            }
                        } else {
                            samples.incr("*");
                        }
                    }
                // end iter2
                }
                // end intron
                final long max_count = samples.stream().mapToLong(E -> E.getValue()).max().orElse(0L);
                if (max_count == 0)
                    continue;
                if (this.only_all_introns && max_count != transcript.getIntronCount())
                    continue;
                // ok good candidate
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(transcript.getContig());
                vcb.start(transcript.getStart());
                vcb.stop(transcript.getEnd());
                switch(this.idKey) {
                    case gene_name:
                        final String gn = transcript.getGene().getGeneName();
                        vcb.id(StringUtils.isBlank(gn) ? transcript.getId() : gn);
                        break;
                    case gene_id:
                        vcb.id(transcript.getGene().getId());
                        break;
                    case transcript_id:
                        vcb.id(transcript.getId());
                        break;
                    default:
                        throw new IllegalStateException();
                }
                final List<Allele> alleles = Arrays.asList(ref, alt);
                // vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY,2);
                // vcb.attribute(VCFConstants.ALLELE_COUNT_KEY,1);
                // vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY,0.5);
                vcb.attribute(VCFConstants.SVTYPE, "DEL");
                vcb.attribute(VCFConstants.END_KEY, transcript.getEnd());
                vcb.attribute("SVLEN", transcript.getLengthOnReference());
                vcb.attribute(ATT_INTRONS_BOUNDS, transcript.getIntrons().stream().map(S -> "" + S.getStart() + "-" + S.getEnd()).collect(Collectors.toList()));
                vcb.attribute(ATT_INTRONS_SIZES, transcript.getIntrons().stream().mapToInt(S -> S.getLengthOnReference()).toArray());
                for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
                    final String v = transcript.getProperties().get(att);
                    if (StringUtils.isBlank(v))
                        continue;
                    vcb.attribute(att, v);
                }
                vcb.alleles(alleles);
                boolean pass_filter = true;
                // introns sequences
                vcb.attribute(ATT_INTRONS_CANDIDATE_COUNT, max_count);
                vcb.attribute(ATT_INTRONS_COUNT, transcript.getIntronCount());
                vcb.attribute(ATT_INTRONS_CANDIDATE_FRACTION, max_count / (float) transcript.getIntronCount());
                if (transcript.getIntronCount() != max_count) {
                    vcb.filter(ATT_NOT_ALL_INTRONS);
                    pass_filter = false;
                }
                if (knownGeneIds.contains(transcript.getGene().getId())) {
                    vcb.filter(ATT_KNOWN);
                    pass_filter = false;
                }
                if (header.hasGenotypingData()) {
                    final List<Genotype> genotypes = new ArrayList<>();
                    for (final String sn : header.getSampleNamesInOrder()) {
                        final List<Allele> gtalleles;
                        if (samples.count(sn) == 0L) {
                            gtalleles = Arrays.asList(ref, ref);
                        } else {
                            gtalleles = Arrays.asList(ref, alt);
                        }
                        final GenotypeBuilder gb = new GenotypeBuilder(sn, gtalleles);
                        genotypes.add(gb.make());
                    }
                    vcb.genotypes(genotypes);
                }
                if (pass_filter)
                    vcb.passFilters();
                vcw0.add(vcb.make());
            }
        }
        progress.close();
        vcw0.close();
        vcfFileReader.close();
        vcfFileReader = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(vcfFileReader);
        CloserUtil.close(vcw0);
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) List(java.util.List) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Counter(com.github.lindenb.jvarkit.util.Counter) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) VCFReader(htsjdk.variant.vcf.VCFReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType)

Example 5 with Intron

use of com.github.lindenb.jvarkit.util.bio.structure.Intron in project jvarkit by lindenb.

the class KnownRetroCopy method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VCFIterator iterin, VariantContextWriter out) {
    try {
        /* load the reference genome */
        /* create a contig name converter from the REF */
        final Set<String> knownGeneIds;
        if (this.knownPath != null) {
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.knownPath)) {
                knownGeneIds = br.lines().filter(L -> !StringUtils.isBlank(L)).map(S -> S.trim()).filter(S -> !(S.equals("-") || S.equals(".") || S.startsWith("#"))).collect(Collectors.toSet());
            }
        } else {
            knownGeneIds = Collections.emptySet();
        }
        // open the sam file
        final VCFHeader header = iterin.getHeader();
        final SAMSequenceDictionary dict = header.getSequenceDictionary();
        final IntervalTreeMap<List<Intron>> intronMap = new IntervalTreeMap<>();
        final GtfReader gtfReader = new GtfReader(this.gtfPath);
        if (dict != null && !dict.isEmpty())
            gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
        gtfReader.getAllGenes().stream().filter(G -> G.getTranscripts().stream().count() > 0L).filter(G -> G.getTranscripts().stream().anyMatch(T -> T.getIntronCount() >= this.min_intron_count)).flatMap(G -> G.getTranscripts().stream()).flatMap(G -> G.getIntrons().stream()).forEach(INTRON -> {
            List<Intron> introns = intronMap.get(INTRON);
            if (introns == null) {
                introns = new ArrayList<>();
                intronMap.put(INTRON.toInterval(), introns);
            }
            introns.add(INTRON);
        });
        gtfReader.close();
        /**
         * build vcf header
         */
        final VCFHeader header2 = new VCFHeader(header);
        header2.addMetaDataLine(new VCFFilterHeaderLine(ATT_FILTER_INTRON, "variant could be a deleted intron from a retrocopy"));
        header2.addMetaDataLine(new VCFFilterHeaderLine(ATT_FILTER_KNOWN, "variant could be a deleted intron from a known retrocopy"));
        header2.addMetaDataLine(new VCFInfoHeaderLine(ATT_RETROCOPY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Identifiers for the retrocopies."));
        JVarkitVersion.getInstance().addMetaData(this, header2);
        out.writeHeader(header2);
        final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().logger(LOG).dictionary(dict).build();
        while (iterin.hasNext()) {
            final VariantContext ctx = progress.apply(iterin.next());
            if (ctx.getStart() == ctx.getEnd()) {
                out.add(ctx);
                continue;
            }
            final String svType = ctx.getAttributeAsString(VCFConstants.SVTYPE, "");
            if (svType.equals("BND") || svType.equals("INS")) {
                out.add(ctx);
                continue;
            }
            boolean known_flag = false;
            final Set<String> retrocopy_identifiers = new TreeSet<>();
            for (final Intron intron : intronMap.getOverlapping(ctx).stream().flatMap(L -> L.stream()).filter(I -> isWithinDistance(I.getStart(), ctx.getStart())).filter(I -> isWithinDistance(I.getEnd(), ctx.getEnd())).collect(Collectors.toList())) {
                if (knownGeneIds.contains(intron.getTranscript().getGene().getId())) {
                    known_flag = true;
                }
                retrocopy_identifiers.add(VCFUtils.escapeInfoField(intron.getTranscript().getGene().getId()));
                retrocopy_identifiers.add(VCFUtils.escapeInfoField(intron.getTranscript().getId()));
                String s = intron.getTranscript().getGene().getProperties().getOrDefault("gene_name", "");
                if (!StringUtils.isBlank(s))
                    retrocopy_identifiers.add(VCFUtils.escapeInfoField(s));
                s = intron.getTranscript().getProperties().getOrDefault("transcript_name", "");
                if (!StringUtils.isBlank(s))
                    retrocopy_identifiers.add(VCFUtils.escapeInfoField(s));
            }
            if (retrocopy_identifiers.isEmpty()) {
                out.add(ctx);
                continue;
            }
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            vcb.filter(ATT_FILTER_INTRON);
            if (known_flag)
                vcb.filter(ATT_FILTER_KNOWN);
            vcb.attribute(ATT_RETROCOPY, new ArrayList<>(retrocopy_identifiers));
            out.add(vcb.make());
        }
        progress.close();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) VCFConstants(htsjdk.variant.vcf.VCFConstants) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) File(java.io.File) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) List(java.util.List) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) BufferedReader(java.io.BufferedReader) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) TreeSet(java.util.TreeSet) BufferedReader(java.io.BufferedReader) ArrayList(java.util.ArrayList) List(java.util.List) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap)

Aggregations

Parameter (com.beust.jcommander.Parameter)5 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)5 GtfReader (com.github.lindenb.jvarkit.util.bio.structure.GtfReader)5 Intron (com.github.lindenb.jvarkit.util.bio.structure.Intron)5 Program (com.github.lindenb.jvarkit.util.jcommander.Program)5 Logger (com.github.lindenb.jvarkit.util.log.Logger)5 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)5 VariantContext (htsjdk.variant.variantcontext.VariantContext)5 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)5 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)4 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)4 Gene (com.github.lindenb.jvarkit.util.bio.structure.Gene)4 Transcript (com.github.lindenb.jvarkit.util.bio.structure.Transcript)4 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)4 ContigDictComparator (com.github.lindenb.jvarkit.util.samtools.ContigDictComparator)4 VCFHeader (htsjdk.variant.vcf.VCFHeader)4 VCFReader (htsjdk.variant.vcf.VCFReader)4 Path (java.nio.file.Path)4 List (java.util.List)4 Collectors (java.util.stream.Collectors)4