Search in sources :

Example 1 with UTR

use of com.github.lindenb.jvarkit.util.bio.structure.UTR in project jvarkit by lindenb.

the class GtfUpstreamOrf method doWork.

/**
 * Reads a GTF (file named in {@code args}, or stdin when no file is given), searches the
 * 5' UTR of every coding transcript for upstream open reading frames (uORF) and writes
 * them as GTF-like records (gene / transcript / exon / CDS / start_codon / stop_codon)
 * to {@code this.outputFile} or stdout.
 *
 * <p>The GTF reader, the output writer and the reference sequence file are always
 * released in the {@code finally} clause, including when an error interrupts the run.
 *
 * @param args command line arguments; at most one input path is expected
 * @return 0 on success, -1 if any {@link Throwable} was raised (it is logged)
 */
@Override
public int doWork(final List<String> args) {
    GtfReader gtfReader = null;
    PrintWriter pw = null;
    try {
        this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
        this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
        final ContigDictComparator ctgDictComparator = new ContigDictComparator(refDict);
        final String input = oneFileOrNull(args);
        gtfReader = input == null ? new GtfReader(stdin()) : new GtfReader(input);
        gtfReader.setContigNameConverter(this.refCtgNameConverter);
        // keep stranded genes only, ordered by (contig in dictionary order, start, end)
        final List<Gene> genes = gtfReader.getAllGenes().stream().filter(G -> G.hasStrand()).sorted((A, B) -> {
            int i = ctgDictComparator.compare(A.getContig(), B.getContig());
            if (i != 0)
                return i;
            i = Integer.compare(A.getStart(), B.getStart());
            if (i != 0)
                return i;
            return Integer.compare(A.getEnd(), B.getEnd());
        }).collect(Collectors.toList());
        // the whole GTF is in memory now: release the reader as early as possible
        gtfReader.close();
        gtfReader = null;
        pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
        /* header: kozak scores, contigs, program version, input name */
        for (final KozakSequence.Strength f : KozakSequence.Strength.values()) {
            pw.println("#kozak." + f.name() + "=" + kozakStrengthToScore(f));
        }
        final String gtfSource = getProgramName().toLowerCase();
        for (final SAMSequenceRecord ssr : refDict.getSequences()) {
            pw.println("##contig " + ssr.getSequenceName() + ": length:" + ssr.getSequenceLength());
        }
        pw.println("#" + gtfSource + ":" + JVarkitVersion.getInstance().toString());
        if (!StringUtils.isBlank(input)) {
            pw.println("#gtf:" + input);
        }
        final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().dictionary(refDict).logger(LOG).build();
        for (final Gene gene : genes) {
            progress.apply(gene);
            /* new reference sequence */
            if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(gene.getContig())) {
                this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, gene.getContig());
            }
            // candidate mRNAs: coding, stranded, with a defined start codon and a 5' UTR
            final List<RNASequence> rnas = gene.getTranscripts().stream().filter(T -> T.isCoding()).filter(T -> T.hasStrand()).filter(T -> T.hasCodonStartDefined()).filter(T -> T.getTranscriptUTR5().isPresent()).map(T -> new RNASequence(T)).collect(Collectors.toList());
            if (rnas.isEmpty())
                continue;
            final Set<OpenReadingFrame> orfs = rnas.stream().flatMap(R -> R.getUpstreamOpenReadingFrames().stream()).collect(Collectors.toSet());
            if (orfs.isEmpty())
                continue;
            // the 'gene' record is emitted lazily, just before the first retained uORF
            boolean gene_printed = false;
            for (final OpenReadingFrame uORF : orfs) {
                /* is there any other RNA containing this uORF ?*/
                if (rnas.stream().filter(other -> !other.getTranscript().getId().equals(uORF.mRNA.getTranscript().getId())).anyMatch(other -> {
                    // other must have atg in frame and ATG before the observed one
                    final int other_atg0 = other.getATG0InRNA();
                    if (uORF.in_rna_atg0 % 3 != other_atg0 % 3)
                        return false;
                    return uORF.in_rna_atg0 >= other_atg0;
                }))
                    continue;
                final String transcript_id = uORF.getTranscript().getId() + ".uorf" + (1 + uORF.in_rna_atg0);
                if (!gene_printed) {
                    gene_printed = true;
                    pw.print(gene.getContig());
                    pw.print("\t");
                    // source
                    pw.print(gtfSource);
                    pw.print("\t");
                    pw.print("gene");
                    pw.print("\t");
                    // start
                    pw.print(gene.getStart());
                    pw.print("\t");
                    // end
                    pw.print(gene.getEnd());
                    pw.print("\t");
                    // score
                    pw.print(".");
                    pw.print("\t");
                    // strand
                    pw.print(gene.getStrand());
                    pw.print("\t");
                    // phase
                    pw.print(".");
                    pw.print("\t");
                    pw.print(keyvalue("gene_id", gene.getId()));
                    pw.println();
                }
                // TRANSCRIPT
                // safe get(): only transcripts having a 5' UTR were turned into RNASequence above
                final UTR utr_5_prime = uORF.getTranscript().getTranscriptUTR5().get();
                pw.print(gene.getContig());
                pw.print("\t");
                pw.print(gtfSource);
                pw.print("\t");
                pw.print("transcript");
                pw.print("\t");
                // start/end columns: genomic 1-based positions of the ATG and of the stop.
                // When in_rna_stop0 is NPOS the transcript boundary is printed instead
                // (NPOS presumably means 'no stop codon found' - confirm in OpenReadingFrame).
                if (gene.isPositiveStrand()) {
                    pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_atg0] + 1);
                    pw.print("\t");
                    if (uORF.in_rna_stop0 == NPOS) {
                        pw.print(uORF.mRNA.getTranscript().getTxEnd());
                    } else {
                        pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_stop0] + 1);
                    }
                } else {
                    // negative strand: the stop side is the lower genomic coordinate
                    if (uORF.in_rna_stop0 == NPOS) {
                        pw.print(uORF.mRNA.getTranscript().getTxStart());
                    } else {
                        pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_stop0] + 1);
                    }
                    pw.print("\t");
                    pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_atg0] + 1);
                }
                pw.print("\t");
                // score
                pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
                pw.print("\t");
                // strand
                pw.print(gene.getStrand());
                pw.print("\t");
                // phase
                pw.print("0");
                pw.print("\t");
                pw.print(keyvalue("gene_id", gene.getId()));
                pw.print(keyvalue("transcript_id", transcript_id));
                pw.print(keyvalue("transcript_biotype", "uORF"));
                pw.print(keyvalue("kozak-seq", uORF.kozak.getString()));
                pw.print(keyvalue("kozak-strength", uORF.kozak.getStrength()));
                pw.print(keyvalue("translation", uORF.peptide));
                pw.print(keyvalue("uORF-atg-in-frame-with-transcript-atg", uORF.uorf_atg_in_frame));
                pw.print(keyvalue("utr", utr_5_prime.toString() + " " + utr_5_prime.getStart() + "-" + utr_5_prime.getEnd()));
                pw.println();
                // Exon
                for (final Exon exon : uORF.getTranscript().getExons()) {
                    pw.print(exon.getContig());
                    pw.print("\t");
                    pw.print(gtfSource);
                    pw.print("\t");
                    pw.print("exon");
                    pw.print("\t");
                    pw.print(exon.getStart());
                    pw.print("\t");
                    pw.print(exon.getEnd());
                    pw.print("\t");
                    // score
                    pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
                    pw.print("\t");
                    // strand
                    pw.print(exon.getStrand());
                    pw.print("\t");
                    // phase
                    pw.print(0);
                    pw.print("\t");
                    pw.print(keyvalue("gene_id", gene.getId()));
                    pw.print(keyvalue("transcript_id", transcript_id));
                    pw.println();
                }
                // genomic blocks covered by the three bases of the start / stop codon
                final List<Interval> startBlocks = uORF.mRNA.getCodonBlocks(uORF.in_rna_atg0, uORF.in_rna_atg0 + 1, uORF.in_rna_atg0 + 2);
                final List<Interval> stopBlocks = uORF.in_rna_stop0 != NPOS ? uORF.mRNA.getCodonBlocks(uORF.in_rna_stop0, uORF.in_rna_stop0 + 1, uORF.in_rna_stop0 + 2) : Collections.emptyList();
                // CDS (only written when the stop codon is known)
                if (!stopBlocks.isEmpty()) {
                    final int cdsStart = startBlocks.stream().mapToInt(B -> B.getStart()).min().orElseThrow(IllegalStateException::new);
                    final int cdsEnd = stopBlocks.stream().mapToInt(B -> B.getEnd()).max().orElseThrow(IllegalStateException::new);
                    for (final Exon exon : uORF.getTranscript().getExons()) {
                        if (exon.getEnd() < cdsStart)
                            continue;
                        // the 'break' relies on exons being iterated in ascending genomic order
                        if (exon.getStart() > cdsEnd)
                            break;
                        pw.print(exon.getContig());
                        pw.print("\t");
                        pw.print(gtfSource);
                        pw.print("\t");
                        pw.print("CDS");
                        pw.print("\t");
                        pw.print(Math.max(cdsStart, exon.getStart()));
                        pw.print("\t");
                        pw.print(Math.min(cdsEnd, exon.getEnd()));
                        pw.print("\t");
                        // score
                        pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
                        pw.print("\t");
                        // strand
                        pw.print(exon.getStrand());
                        pw.print("\t");
                        // phase
                        pw.print(uORF.getFrameAt(Math.max(cdsStart, exon.getStart())));
                        pw.print("\t");
                        pw.print(keyvalue("gene_id", gene.getId()));
                        pw.print(keyvalue("transcript_id", transcript_id));
                        pw.println();
                    }
                }
                // CODON START
                for (final Interval startc : startBlocks) {
                    PARANOID.assertLe(startc.getStart(), startc.getEnd());
                    pw.print(startc.getContig());
                    pw.print("\t");
                    pw.print(gtfSource);
                    pw.print("\t");
                    pw.print("start_codon");
                    pw.print("\t");
                    pw.print(startc.getStart());
                    pw.print("\t");
                    pw.print(startc.getEnd());
                    pw.print("\t");
                    // score
                    pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
                    pw.print("\t");
                    // strand
                    pw.print(gene.getStrand());
                    pw.print("\t");
                    // phase
                    pw.print(uORF.getFrameAt(startc.getStart()));
                    pw.print("\t");
                    pw.print(keyvalue("gene_id", gene.getId()));
                    pw.print(keyvalue("transcript_id", transcript_id));
                    pw.print(keyvalue("distance-mrna-atg", uORF.mRNA.getATG0InRNA() - uORF.in_rna_atg0));
                    pw.print(keyvalue("pos0-in-mrna", uORF.in_rna_atg0));
                    pw.print(keyvalue("spliced", startBlocks.size() > 1));
                    pw.println();
                }
                // CODON END
                for (final Interval stopc : stopBlocks) /* might be empty */
                {
                    PARANOID.assertLe(stopc.getStart(), stopc.getEnd());
                    pw.print(stopc.getContig());
                    pw.print("\t");
                    pw.print(gtfSource);
                    pw.print("\t");
                    pw.print("stop_codon");
                    pw.print("\t");
                    pw.print(stopc.getStart());
                    pw.print("\t");
                    pw.print(stopc.getEnd());
                    pw.print("\t");
                    // score
                    pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
                    pw.print("\t");
                    // strand
                    pw.print(gene.getStrand());
                    pw.print("\t");
                    // phase
                    pw.print(uORF.getFrameAt(stopc.getStart()));
                    pw.print("\t");
                    pw.print(keyvalue("gene_id", gene.getId()));
                    pw.print(keyvalue("transcript_id", transcript_id));
                    pw.print(keyvalue("spliced", stopBlocks.size() > 1));
                    pw.println();
                }
            }
        }
        progress.close();
        pw.flush();
        pw.close();
        // already closed: nothing left for the finally clause to do with it
        pw = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        // CloserUtil.close ignores null, so these calls only act on the error path,
        // where the reader and/or the writer would otherwise stay open (and unflushed).
        CloserUtil.close(gtfReader);
        CloserUtil.close(pw);
        CloserUtil.close(this.indexedFastaSequenceFile);
    }
}
Also used : Arrays(java.util.Arrays) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) AbstractCharSequence(com.github.lindenb.jvarkit.lang.AbstractCharSequence) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) HashMap(java.util.HashMap) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) Map(java.util.Map) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) KozakSequence(com.github.lindenb.jvarkit.util.bio.KozakSequence) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) Paranoid(com.github.lindenb.jvarkit.lang.Paranoid) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) 
ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) KozakSequence(com.github.lindenb.jvarkit.util.bio.KozakSequence) PrintWriter(java.io.PrintWriter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) Interval(htsjdk.samtools.util.Interval)

Example 2 with UTR

use of com.github.lindenb.jvarkit.util.bio.structure.UTR in project jvarkit by lindenb.

the class VCFPredictions method doVcfToVcf.

/**
 * Annotates every variant read from {@code r} with consequence predictions computed from
 * the transcripts overlapping it, and writes the annotated variant to {@code w}.
 *
 * <p>The predictions are stored in one INFO attribute whose name depends on
 * {@code this.outputSyntax}: {@code CSQ} for Vep, {@code ANN} for SnpEff, {@code TAG} otherwise.
 * Variants whose contig cannot be converted to a contig of the reference dictionary are
 * written unchanged.
 *
 * <p>The writer, the iterator and the reference genome are closed in the {@code finally} clause.
 *
 * @param inputName name of the input (not used in this method)
 * @param r the input variants
 * @param w the output writer
 * @return 0 on success, -1 if any {@link Throwable} was raised (it is logged)
 */
@Override
protected int doVcfToVcf(String inputName, VCFIterator r, VariantContextWriter w) {
    try {
        this.referenceGenome = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidxPath);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
        final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
        loadGtf(dict);
        final VCFHeader header = r.getHeader();
        final VCFHeader h2 = new VCFHeader(header);
        JVarkitVersion.getInstance().addMetaData(this, h2);
        // declare the INFO field matching the selected output syntax
        switch(this.outputSyntax) {
            case Vep:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
                    break;
                }
            case SnpEff:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
                    break;
                }
            default:
                {
                    // native syntax: the format description is the '|'-joined names of FORMAT1
                    final StringBuilder format = new StringBuilder();
                    for (FORMAT1 f : FORMAT1.values()) {
                        if (format.length() > 0)
                            format.append("|");
                        format.append(f.name());
                    }
                    h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
                    break;
                }
        }
        w.writeHeader(h2);
        final RNASequenceFactory rnaSeqFactory = new RNASequenceFactory();
        rnaSeqFactory.setContigToGenomicSequence(S -> getGenomicSequence(S));
        while (r.hasNext()) {
            final VariantContext ctx = r.next();
            final String normalizedContig = contigNameConverter.apply(ctx.getContig());
            // unknown contig: pass the variant through without annotation
            if (StringUtil.isBlank(normalizedContig)) {
                w.add(ctx);
                continue;
            }
            // all transcripts stored under intervals overlapping the variant
            final List<Transcript> transcripts = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
            final List<Annotation> all_annotations = new ArrayList<>();
            final List<Allele> alternateAlleles;
            if (ctx.getNAlleles() <= 1) {
                // not a variant, just REF
                alternateAlleles = Arrays.asList(Allele.NO_CALL);
            } else {
                alternateAlleles = ctx.getAlternateAlleles();
            }
            for (final Allele altAllele : alternateAlleles) {
                // skip alleles that carry no sequence change to predict
                if (altAllele.isReference() || altAllele.equals(Allele.SPAN_DEL) || altAllele.equals(Allele.NON_REF_ALLELE))
                    continue;
                /* intergenic ====================================================== */
                if (transcripts.isEmpty()) {
                    // left neighbour: among transcripts found between position 1 and the
                    // variant start, keep the one with the greatest end
                    Transcript leftGene = null;
                    String leftId = "";
                    String leftName = "";
                    for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, 1, ctx.getStart())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
                        final Transcript t = iter.next();
                        if (leftGene == null || leftGene.getEnd() < t.getEnd()) {
                            leftGene = t;
                            leftId = t.getGene().getId();
                            leftName = t.getGene().getGeneName();
                        }
                    }
                    // right neighbour: among transcripts found between the variant end and
                    // the end of the contig, keep the one with the smallest start
                    Transcript rightGene = null;
                    String rightId = "";
                    String rightName = "";
                    for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getEnd(), dict.getSequence(normalizedContig).getSequenceLength())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
                        final Transcript t = iter.next();
                        if (rightGene == null || t.getStart() < rightGene.getStart()) {
                            rightGene = t;
                            rightId = t.getGene().getId();
                            rightName = t.getGene().getGeneName();
                        }
                    }
                    // intergenic
                    // ids/names are "left-right"; a missing neighbour contributes an empty string
                    final Annotation annot = new Annotation(altAllele);
                    annot.seqont.add("intergenic");
                    annot.geneId = leftId + "-" + rightId;
                    annot.geneName = leftName + "-" + rightName;
                    all_annotations.add(annot);
                } else {
                    for (final Transcript transcript : transcripts) {
                        // one annotation per (allele, transcript); it is registered before the
                        // sequence-ontology terms are filled in below
                        final Annotation annotation = new Annotation(altAllele, transcript);
                        all_annotations.add(annotation);
                        if (!transcript.overlaps(ctx)) {
                            // variant outside the transcript: upstream/downstream is decided
                            // from the side of the transcript and its strand
                            if (((ctx.getEnd() < transcript.getStart() && transcript.isNegativeStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isPositiveStrand()))) {
                                if (ctx.withinDistanceOf(transcript, 500)) {
                                    annotation.seqont.add("500B_downstream_variant");
                                } else if (ctx.withinDistanceOf(transcript, 2_000)) {
                                    annotation.seqont.add("2KB_downstream_variant");
                                }
                            } else if (((ctx.getEnd() < transcript.getStart() && transcript.isPositiveStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isNegativeStrand()))) {
                                if (ctx.withinDistanceOf(transcript, 2_000)) {
                                    annotation.seqont.add("2KB_upstream_variant");
                                } else if (ctx.withinDistanceOf(transcript, 5_000)) {
                                    annotation.seqont.add("5KB_upstream_variant");
                                }
                            }
                            continue;
                        }
                        if (CoordMath.encloses(ctx.getStart(), ctx.getEnd(), transcript.getStart(), transcript.getEnd())) {
                            // TODO can be inversion ,etc...
                            annotation.seqont.add("transcript_ablation");
                            continue;
                        }
                        // side 0 = 5' UTR, side 1 = 3' UTR
                        for (int side = 0; side < 2; ++side) {
                            final Optional<UTR> opt_utr = (side == 0 ? transcript.getTranscriptUTR5() : transcript.getTranscriptUTR3());
                            if (!opt_utr.isPresent())
                                continue;
                            final UTR utr = opt_utr.get();
                            if (CoordMath.overlaps(utr.getStart(), utr.getEnd(), ctx.getStart(), ctx.getEnd())) {
                                annotation.seqont.add(side == 0 ? "5_prime_UTR_variant" : "3_prime_UTR_variant");
                            }
                        }
                        // side 0 = first exon overlapping the variant, side 1 = first intron overlapping it
                        for (int side = 0; side < 2; ++side) {
                            final Optional<? extends ExonOrIntron> opt_ex;
                            if (side == 0) {
                                opt_ex = transcript.getExons().stream().filter(E -> E.overlaps(ctx)).findFirst();
                            } else {
                                opt_ex = transcript.getIntrons().stream().filter(E -> E.overlaps(ctx)).findFirst();
                            }
                            if (!opt_ex.isPresent())
                                continue;
                            final ExonOrIntron ei = opt_ex.get();
                            if (side == 0) {
                                if (transcript.isNonCoding())
                                    annotation.seqont.add("non_coding_transcript_exon_variant");
                            } else {
                                if (transcript.isNonCoding())
                                    annotation.seqont.add("non_coding_transcript_intron_variant");
                                annotation.seqont.add("intron");
                            }
                            // splicing terms are only evaluated for single-position variants
                            if (ctx.getStart() == ctx.getEnd() && ei.isSplicing(ctx.getStart())) {
                                if (ei.isSplicingAcceptor(ctx.getStart())) {
                                    // SPLICING_ACCEPTOR
                                    annotation.seqont.add("splice_acceptor");
                                } else if (ei.isSplicingDonor(ctx.getStart())) {
                                    // SPLICING_DONOR
                                    annotation.seqont.add("splice_donor");
                                } else // ??
                                {
                                    annotation.seqont.add("splicing_variant");
                                }
                            }
                        }
                        // no protein-level prediction for structural variants
                        final StructuralVariantType svType = ctx.getStructuralVariantType();
                        if (svType != null) {
                            continue;
                        }
                        if (transcript.isNonCoding()) {
                            // TODO
                            annotation.seqont.add("non_coding_transcript_variant");
                            continue;
                        }
                        // coding RNA of the transcript, built once and cached by transcript id
                        RNASequence cDNA = this.transcriptId2cdna.get(transcript.getId());
                        if (cDNA == null) {
                            cDNA = rnaSeqFactory.getCodingRNA(transcript);
                            this.transcriptId2cdna.put(transcript.getId(), cDNA);
                        }
                        // 0-based index of the variant start in the coding RNA; absent when the
                        // position does not map into it
                        final OptionalInt opt_pos_cdna0 = cDNA.convertGenomic0ToRnaIndex0(ctx.getStart() - 1);
                        if (!opt_pos_cdna0.isPresent())
                            continue;
                        final int pos_cdna0 = opt_pos_cdna0.getAsInt();
                        final int pos_aa = pos_cdna0 / 3;
                        final GeneticCode geneticCode = GeneticCode.getStandard();
                        if (AcidNucleics.isATGC(altAllele.getBaseString())) {
                            String bases = altAllele.getBaseString().toUpperCase();
                            // the RNA is read on the transcript strand: reverse-complement the ALT for '-' transcripts
                            if (transcript.isNegativeStrand()) {
                                bases = AcidNucleics.reverseComplement(bases);
                            }
                            final MutedSequence mutRNA = new MutedSequence(cDNA, pos_cdna0, ctx.getReference().length(), bases);
                            final PeptideSequence<CharSequence> wildProt = PeptideSequence.of(cDNA, geneticCode);
                            final PeptideSequence<CharSequence> mutProt = PeptideSequence.of(mutRNA, geneticCode);
                            // offset of the variant inside its codon; the codon starts at pos_cdna0 - mod
                            final int mod = pos_cdna0 % 3;
                            annotation.wildCodon = ("" + cDNA.charAt(pos_cdna0 - mod + 0) + cDNA.charAt(pos_cdna0 - mod + 1) + cDNA.charAt(pos_cdna0 - mod + 2));
                            annotation.mutCodon = ("" + mutRNA.charAt(pos_cdna0 - mod + 0) + mutRNA.charAt(pos_cdna0 - mod + 1) + mutRNA.charAt(pos_cdna0 - mod + 2));
                            annotation.position_protein = (pos_aa + 1);
                            annotation.wildAA = String.valueOf(wildProt.charAt(pos_aa));
                            annotation.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
                            // classify by comparing the wild and mutated amino acid at the same protein index
                            if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
                                annotation.seqont.add("stop_lost");
                            } else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
                                annotation.seqont.add("stop_gained");
                            } else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
                                annotation.seqont.add("synonymous");
                            } else {
                                annotation.seqont.add("missense_variant");
                            }
                        }
                    }
                }
            }
            // a Set removes annotations that render to the same string
            final Set<String> info = new HashSet<String>(all_annotations.size());
            for (final Annotation a : all_annotations) {
                info.add(a.toString());
            }
            final VariantContextBuilder vb = new VariantContextBuilder(ctx);
            final String thetag;
            switch(this.outputSyntax) {
                case Vep:
                    thetag = "CSQ";
                    break;
                case SnpEff:
                    thetag = "ANN";
                    break;
                default:
                    thetag = TAG;
                    break;
            }
            vb.attribute(thetag, info.toArray());
            w.add(vb.make());
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(w);
        CloserUtil.close(r);
        CloserUtil.close(this.referenceGenome);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) ExonOrIntron(com.github.lindenb.jvarkit.util.bio.structure.ExonOrIntron) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) StringUtil(htsjdk.samtools.util.StringUtil) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Optional(java.util.Optional) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) PeptideSequence(com.github.lindenb.jvarkit.util.bio.structure.PeptideSequence) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) OptionalInt(java.util.OptionalInt) RNASequenceFactory(com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) 
Interval(htsjdk.samtools.util.Interval) DelegateCharSequence(com.github.lindenb.jvarkit.lang.DelegateCharSequence) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) WeakHashMap(java.util.WeakHashMap) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) RNASequence(com.github.lindenb.jvarkit.util.bio.structure.RNASequence) Iterator(java.util.Iterator) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) OnePassVcfLauncher(com.github.lindenb.jvarkit.jcommander.OnePassVcfLauncher) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) DelegateCharSequence(com.github.lindenb.jvarkit.lang.DelegateCharSequence) RNASequence(com.github.lindenb.jvarkit.util.bio.structure.RNASequence) OptionalInt(java.util.OptionalInt) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) RNASequenceFactory(com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ExonOrIntron(com.github.lindenb.jvarkit.util.bio.structure.ExonOrIntron) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType)

Example 3 with UTR

use of com.github.lindenb.jvarkit.util.bio.structure.UTR in project jvarkit by lindenb.

the class VcfBurdenGtf method runBurden.

/**
 * Runs a Fisher burden test over sub-parts of the transcripts found in the GTF file
 * (exons, introns, sliding intron windows, UTRs, all exons, all CDS, sliding windows
 * over the spliced exons) or, when {@code intergenic_contig} is set, over sliding
 * windows of the regions of the selected contig(s) that are not covered by any gene.
 *
 * One tab-delimited line is printed to {@code pw} for every tested region whose
 * p-value is not greater than {@code fisherTreshold}.
 *
 * @param pw        destination of the tab-delimited report
 * @param vcfReader indexed source of variants; must provide a sequence dictionary
 * @param vcw       optional VCF writer (may be {@code null}); when set, each variant of a
 *                  reported region is written with the region label in {@code BURDEN_KEY}
 * @throws IOException declared by the overridden method; raised by the GTF reader on close
 */
@Override
protected void runBurden(PrintWriter pw, VCFReader vcfReader, VariantContextWriter vcw) throws IOException {
    final SAMSequenceDictionary vcfDict = SequenceDictionaryUtils.extractRequired(vcfReader.getHeader());
    final List<Gene> all_genes;
    try (GtfReader gtfReader = new GtfReader(this.gtfFile)) {
        gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(vcfDict));
        // keep every gene, unless a single intergenic contig was requested: then keep only the genes of that contig
        all_genes = gtfReader.getAllGenes().stream().filter(G -> StringUtil.isBlank(this.intergenic_contig) || this.intergenic_contig.equals("*") || this.intergenic_contig.equals(G.getContig())).sorted(new ContigDictComparator(vcfDict).createLocatableComparator()).collect(Collectors.toCollection(ArrayList::new));
    }
    /* report header: same 17 columns as the rows printed below */
    pw.println(String.join("\t", "#chrom", "start0", "end", "name", "length", "gene", "type", "strand", "transcript", "gene-id", "intervals", "p-value", "affected_alt", "affected_hom", "unaffected_alt", "unaffected_hom", "variants.count"));
    final List<SimpleInterval> all_intergenic = new ArrayList<>();
    if (!StringUtil.isBlank(this.intergenic_contig)) {
        for (final SAMSequenceRecord ssr : vcfDict.getSequences()) {
            if (!(this.intergenic_contig.equals("*") || this.intergenic_contig.equals(ssr.getSequenceName())))
                continue;
            // one bit per 1-based position of the contig; a bit is set when the position is covered by a gene
            final BitSet filled = new BitSet(ssr.getSequenceLength() + 2);
            all_genes.stream().filter(G -> G.getContig().equals(ssr.getSequenceName())).forEach(G -> filled.set(G.getStart(), 1 + /* BitSet.set(from,to): 'to' is exclusive */
            Math.min(G.getEnd(), ssr.getSequenceLength())));
            int i = 1;
            while (i < ssr.getSequenceLength()) {
                if (filled.get(i)) {
                    i++;
                    continue;
                }
                // 'i' is the first uncovered base; move 'j' to the next covered base or to the contig end
                int j = i;
                while (j < ssr.getSequenceLength() && !filled.get(j)) {
                    j++;
                }
                // when 'j' stopped on a covered base, that base belongs to a gene: the intergenic region ends just before it
                final int intergenic_end = filled.get(j) ? j - 1 : j;
                all_intergenic.add(new SimpleInterval(ssr.getSequenceName(), i, intergenic_end));
                i = j + 1;
            }
            all_genes.removeIf(G -> G.getContig().equals(ssr.getSequenceName()));
        }
        // in intergenic mode, genes are only used to find the gaps: they are not tested themselves
        all_genes.clear();
    }
    final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
    try {
        /* run genes */
        for (final Gene gene : all_genes) {
            progress.apply(gene);
            // load the accepted variants of the gene once; every sub-part is then searched in memory
            final IntervalTree<VariantContext> intervalTree = new IntervalTree<>();
            vcfReader.query(gene).stream().filter(V -> accept(V)).forEach(V -> intervalTree.put(V.getStart(), V.getEnd(), V));
            if (intervalTree.size() == 0)
                continue;
            for (final Transcript transcript : gene.getTranscripts()) {
                final List<SubPartOfTranscript> parts = new ArrayList<>();
                /* each exon, each intron */
                parts.addAll(transcript.getExons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
                parts.addAll(transcript.getIntrons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
                /* sliding windows in the introns larger than the window */
                final int intron_window_size = 1000;
                final int intron_window_shift = 500;
                for (final Intron intron : transcript.getIntrons()) {
                    if (intron.getLengthOnReference() <= intron_window_size)
                        continue;
                    int start_pos = intron.getStart();
                    while (start_pos + intron_window_size <= intron.getEnd()) {
                        final int xend = Math.min(intron.getEnd(), start_pos + intron_window_size - 1);
                        // window is [xstart, xend], exactly intron_window_size bases, never starting before the intron
                        final int xstart = xend - intron_window_size + 1;
                        parts.add(new SubPartOfTranscript(transcript, intron.getName() + ".Sliding", Collections.singletonList(new SimpleInterval(intron.getContig(), xstart, xend))));
                        start_pos += intron_window_shift;
                    }
                }
                /* UTRs */
                for (final UTR utr : transcript.getUTRs()) {
                    parts.add(new SubPartOfTranscript(transcript, utr.getName(), utr.getIntervals()));
                }
                /* all the exons together; with one exon this would duplicate the single-exon part */
                if (transcript.getExonCount() > 1) {
                    parts.add(new SubPartOfTranscript(transcript, "AllExons", transcript.getExons().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
                }
                /* all the CDS together */
                if (transcript.hasCodonStartDefined() && transcript.hasCodonStopDefined() && transcript.getAllCds().size() > 1) {
                    parts.add(new SubPartOfTranscript(transcript, "AllCds", transcript.getAllCds().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
                }
                /* map each index of the concatenated exons to its genomic position */
                final int L = transcript.getTranscriptLength();
                final int[] index2genomic = new int[L];
                int pos = 0;
                for (final Exon exon : transcript.getExons()) {
                    for (int i = exon.getStart(); i <= exon.getEnd(); i++) {
                        index2genomic[pos] = i;
                        pos++;
                    }
                }
                /* sliding windows over the concatenated exons; a window crossing a junction yields several intervals */
                final int window_size = 200;
                final int window_shift = 100;
                int array_index = 0;
                while (array_index < index2genomic.length) {
                    final List<Locatable> intervals = new ArrayList<>();
                    int prev_pos = -1;
                    int start_pos = index2genomic[array_index];
                    int i = 0;
                    while (i < window_size && array_index + i < index2genomic.length) {
                        final int curr_pos = index2genomic[array_index + i];
                        // a gap between two consecutive positions closes the current interval
                        if (i > 0 && prev_pos + 1 != curr_pos) {
                            intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
                            start_pos = curr_pos;
                        }
                        prev_pos = curr_pos;
                        i++;
                    }
                    intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
                    parts.add(new SubPartOfTranscript(transcript, "Sliding", intervals));
                    array_index += window_shift;
                }
                /* test each part */
                for (final SubPartOfTranscript part : parts) {
                    // NOTE(review): a variant overlapping two intervals of the same part is collected twice - confirm this is intended
                    final List<VariantContext> variants = new ArrayList<>();
                    for (final Locatable loc : part.intervals) {
                        final Iterator<IntervalTree.Node<VariantContext>> iter = intervalTree.overlappers(loc.getStart(), loc.getEnd());
                        while (iter.hasNext()) variants.add(iter.next().getValue());
                    }
                    if (variants.isEmpty())
                        continue;
                    final FisherResult fisher = runFisher(variants);
                    if (fisher.p_value > this.fisherTreshold)
                        continue;
                    if (vcw != null) {
                        for (final VariantContext ctx : variants) {
                            vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(part.label)).make());
                        }
                    }
                    pw.print(part.getContig());
                    pw.print("\t");
                    // 0-based start
                    pw.print(part.getStart() - 1);
                    pw.print("\t");
                    pw.print(part.getEnd());
                    pw.print("\t");
                    pw.print(part.label);
                    pw.print("\t");
                    pw.print(part.getLengthOnReference());
                    pw.print("\t");
                    pw.print(transcript.getProperties().getOrDefault("gene_name", "."));
                    pw.print("\t");
                    pw.print(transcript.getProperties().getOrDefault("transcript_type", "."));
                    pw.print("\t");
                    pw.print(gene.getStrand());
                    pw.print("\t");
                    pw.print(transcript.getId());
                    pw.print("\t");
                    pw.print(gene.getId());
                    pw.print("\t");
                    pw.print(part.intervals.stream().map(R -> String.valueOf(R.getStart()) + "-" + R.getEnd()).collect(Collectors.joining(";")));
                    pw.print("\t");
                    pw.print(fisher.p_value);
                    pw.print("\t");
                    pw.print(fisher.affected_alt);
                    pw.print("\t");
                    pw.print(fisher.affected_hom);
                    pw.print("\t");
                    pw.print(fisher.unaffected_alt);
                    pw.print("\t");
                    pw.print(fisher.unaffected_hom);
                    pw.print("\t");
                    pw.print(variants.size());
                    pw.println();
                }
            }
        }
    } finally {
        // close the watcher even if the scan failed
        progress.close();
    }
    final ProgressFactory.Watcher<SimpleInterval> progress2 = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
    try {
        /* scan intergenics ... */
        for (final SimpleInterval intergenic : all_intergenic) {
            progress2.apply(intergenic);
            final int intergenic_window_size = 2000;
            final int intergenic_window_shift = 100;
            final List<SimpleInterval> parts = new ArrayList<>();
            // regions not larger than one window are not tested
            if (intergenic.getLengthOnReference() <= intergenic_window_size)
                continue;
            int start_pos = intergenic.getStart();
            while (start_pos + intergenic_window_size <= intergenic.getEnd()) {
                final int xend = Math.min(intergenic.getEnd(), start_pos + intergenic_window_size - 1);
                // window is [xstart, xend], exactly intergenic_window_size bases, never starting before the region
                final int xstart = xend - intergenic_window_size + 1;
                parts.add(new SimpleInterval(intergenic.getContig(), xstart, xend));
                start_pos += intergenic_window_shift;
            }
            for (final SimpleInterval part : parts) {
                final List<VariantContext> variants = vcfReader.query(part).stream().filter(V -> accept(V)).collect(Collectors.toList());
                if (variants.isEmpty())
                    continue;
                final FisherResult fisher = runFisher(variants);
                if (fisher.p_value > this.fisherTreshold)
                    continue;
                final String label = "intergenic_" + part.getStart() + "_" + part.getEnd();
                if (vcw != null) {
                    for (final VariantContext ctx : variants) {
                        vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(label)).make());
                    }
                }
                pw.print(part.getContig());
                pw.print("\t");
                // 0-based start
                pw.print(part.getStart() - 1);
                pw.print("\t");
                pw.print(part.getEnd());
                pw.print("\t");
                pw.print(label);
                pw.print("\t");
                pw.print(part.getLengthOnReference());
                pw.print("\t");
                // no gene name
                pw.print(".");
                pw.print("\t");
                pw.print("intergenic");
                pw.print("\t");
                // no strand
                pw.print(".");
                pw.print("\t");
                // no transcript id
                pw.print(".");
                pw.print("\t");
                // no gene id
                pw.print(".");
                pw.print("\t");
                pw.print("" + part.getStart() + "-" + part.getEnd());
                pw.print("\t");
                pw.print(fisher.p_value);
                pw.print("\t");
                pw.print(fisher.affected_alt);
                pw.print("\t");
                pw.print(fisher.affected_hom);
                pw.print("\t");
                pw.print(fisher.unaffected_alt);
                pw.print("\t");
                pw.print(fisher.unaffected_hom);
                pw.print("\t");
                pw.print(variants.size());
                pw.println();
            }
        }
    } finally {
        // close the watcher even if the scan failed
        progress2.close();
    }
}
Also used : VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) JexlVariantPredicate(com.github.lindenb.jvarkit.util.vcf.JexlVariantPredicate) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) ArrayList(java.util.ArrayList) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) StringUtil(htsjdk.samtools.util.StringUtil) Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) Iterator(java.util.Iterator) Predicate(java.util.function.Predicate) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VCFReader(htsjdk.variant.vcf.VCFReader) IOException(java.io.IOException) IntervalTree(htsjdk.samtools.util.IntervalTree) Collectors(java.util.stream.Collectors) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) List(java.util.List) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) BitSet(java.util.BitSet) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) TranscriptInterval(com.github.lindenb.jvarkit.util.bio.structure.TranscriptInterval) 
ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) IntervalTree(htsjdk.samtools.util.IntervalTree) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) BitSet(java.util.BitSet) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Locatable(htsjdk.samtools.util.Locatable)

Aggregations

Parameter (com.beust.jcommander.Parameter)3 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)3 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)3 GtfReader (com.github.lindenb.jvarkit.util.bio.structure.GtfReader)3 Transcript (com.github.lindenb.jvarkit.util.bio.structure.Transcript)3 UTR (com.github.lindenb.jvarkit.util.bio.structure.UTR)3 Program (com.github.lindenb.jvarkit.util.jcommander.Program)3 Logger (com.github.lindenb.jvarkit.util.log.Logger)3 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)3 Path (java.nio.file.Path)3 List (java.util.List)3 Collectors (java.util.stream.Collectors)3 SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval)2 JVarkitVersion (com.github.lindenb.jvarkit.util.JVarkitVersion)2 AcidNucleics (com.github.lindenb.jvarkit.util.bio.AcidNucleics)2 GeneticCode (com.github.lindenb.jvarkit.util.bio.GeneticCode)2 Exon (com.github.lindenb.jvarkit.util.bio.structure.Exon)2 Gene (com.github.lindenb.jvarkit.util.bio.structure.Gene)2 ProgressFactory (com.github.lindenb.jvarkit.util.log.ProgressFactory)2 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)2