Search in sources :

Example 1 with RNASequenceFactory

use of com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory in project jvarkit by lindenb.

the class BackLocate method backLocate.

private void backLocate(final PrintStream out, final Transcript transcript, final String geneName, final AminoAcid aa1, final AminoAcid aa2, int peptidePos1, final String extraUserData) throws IOException {
    final Set<String> messages = new LinkedHashSet<>();
    final GeneticCode geneticCode = getGeneticCodeByChromosome(transcript.getContig());
    final RNASequenceFactory rnaSequenceFactory = new RNASequenceFactory();
    if (this.genomicContig == null || !this.genomicContig.getChrom().equals(transcript.getContig())) {
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
        final SAMSequenceRecord ssr = dict.getSequence(transcript.getContig());
        if (ssr == null) {
            LOG.warn(JvarkitException.ContigNotFoundInDictionary.getMessage(transcript.getContig(), dict));
            return;
        }
        this.genomicContig = new GenomicSequence(this.referenceGenome, transcript.getContig());
    }
    rnaSequenceFactory.setContigToGenomicSequence(C -> this.genomicContig);
    final RNASequence wildRNA = rnaSequenceFactory.getCodingRNA(transcript);
    final PeptideSequence<RNASequence> wildProt = PeptideSequence.of(wildRNA, geneticCode);
    final int peptideIndex0 = peptidePos1 - 1;
    if (peptideIndex0 >= wildProt.length()) {
        out.println("##index out of range for :" + transcript.getId() + " petide length=" + wildProt.length());
        return;
    }
    if (wildProt.charAt(peptideIndex0) != Character.toUpperCase(aa1.getOneLetterCode())) {
        messages.add("REF aminod acid [" + peptidePos1 + "] is not the same (" + wildProt.charAt(peptideIndex0) + "/" + aa1 + ")");
    }
    final int[] indexesInRNA = new int[] { 0 + peptideIndex0 * 3, 1 + peptideIndex0 * 3, 2 + peptideIndex0 * 3 };
    final String wildCodon = "" + wildRNA.charAt(indexesInRNA[0]) + wildRNA.charAt(indexesInRNA[1]) + wildRNA.charAt(indexesInRNA[2]);
    /* 2015 : adding possible mut codons */
    final Set<String> possibleAltCodons = new HashSet<>();
    final char[] bases = new char[] { 'A', 'C', 'G', 'T' };
    for (int codon_pos = 0; codon_pos < 3; ++codon_pos) {
        final StringBuilder sb = new StringBuilder(wildCodon);
        for (char mutBase : bases) {
            sb.setCharAt(codon_pos, mutBase);
            if (geneticCode.translate(sb.charAt(0), sb.charAt(1), sb.charAt(2)) == Character.toUpperCase(aa2.getOneLetterCode())) {
                possibleAltCodons.add(sb.toString());
            }
        }
    }
    for (int indexInRna : indexesInRNA) {
        out.print(geneName);
        out.print('\t');
        out.print(aa1.getThreeLettersCode());
        out.print('\t');
        out.print(peptidePos1);
        out.print('\t');
        out.print(aa2.getThreeLettersCode());
        out.print('\t');
        out.print(transcript.getGene().getGeneName());
        out.print('\t');
        out.print(transcript.getId());
        out.print('\t');
        out.print(transcript.getStrand());
        out.print('\t');
        out.print(wildProt.charAt(peptideIndex0));
        out.print('\t');
        out.print(indexInRna);
        out.print('\t');
        out.print(wildCodon);
        out.print('\t');
        if (possibleAltCodons.isEmpty()) {
            out.print('.');
        } else {
            out.print(String.join("|", possibleAltCodons));
        }
        out.print('\t');
        out.print(wildRNA.charAt(indexInRna));
        out.print('\t');
        out.print(transcript.getContig());
        out.print('\t');
        out.print(wildRNA.convertRnaIndex0ToGenomic0(indexInRna));
        out.print('\t');
        String exonName = null;
        for (final Exon exon : transcript.getExons()) {
            int genome = wildRNA.convertRnaIndex0ToGenomic0(indexInRna);
            if (exon.contains(genome)) {
                exonName = exon.getName();
                break;
            }
        }
        out.print(exonName);
        if (this.printSequences) {
            String s = wildRNA.toString();
            out.print('\t');
            out.print(s.substring(0, indexInRna) + "[" + s.charAt(indexInRna) + "]" + (indexInRna + 1 < s.length() ? s.substring(indexInRna + 1) : ""));
            s = wildProt.toString();
            out.print('\t');
            out.print(s.substring(0, peptideIndex0) + "[" + aa1 + "/" + aa2 + "/" + wildProt.charAt(peptideIndex0) + "]" + (peptideIndex0 + 1 < s.length() ? s.substring(peptideIndex0 + 1) : ""));
        }
        out.print('\t');
        out.print(messages.isEmpty() ? "." : String.join(";", messages));
        out.print('\t');
        out.print(extraUserData);
        out.println();
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) RNASequence(com.github.lindenb.jvarkit.util.bio.structure.RNASequence) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) RNASequenceFactory(com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 2 with RNASequenceFactory

use of com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory in project jvarkit by lindenb.

the class VCFPredictions method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VCFIterator r, VariantContextWriter w) {
    try {
        this.referenceGenome = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidxPath);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
        final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
        loadGtf(dict);
        final VCFHeader header = r.getHeader();
        final VCFHeader h2 = new VCFHeader(header);
        JVarkitVersion.getInstance().addMetaData(this, h2);
        switch(this.outputSyntax) {
            case Vep:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
                    break;
                }
            case SnpEff:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
                    break;
                }
            default:
                {
                    final StringBuilder format = new StringBuilder();
                    for (FORMAT1 f : FORMAT1.values()) {
                        if (format.length() > 0)
                            format.append("|");
                        format.append(f.name());
                    }
                    h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
                    break;
                }
        }
        w.writeHeader(h2);
        final RNASequenceFactory rnaSeqFactory = new RNASequenceFactory();
        rnaSeqFactory.setContigToGenomicSequence(S -> getGenomicSequence(S));
        while (r.hasNext()) {
            final VariantContext ctx = r.next();
            final String normalizedContig = contigNameConverter.apply(ctx.getContig());
            if (StringUtil.isBlank(normalizedContig)) {
                w.add(ctx);
                continue;
            }
            final List<Transcript> transcripts = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
            final List<Annotation> all_annotations = new ArrayList<>();
            final List<Allele> alternateAlleles;
            if (ctx.getNAlleles() <= 1) {
                // not a variant, just REF
                alternateAlleles = Arrays.asList(Allele.NO_CALL);
            } else {
                alternateAlleles = ctx.getAlternateAlleles();
            }
            for (final Allele altAllele : alternateAlleles) {
                if (altAllele.isReference() || altAllele.equals(Allele.SPAN_DEL) || altAllele.equals(Allele.NON_REF_ALLELE))
                    continue;
                /* intergenic ====================================================== */
                if (transcripts.isEmpty()) {
                    Transcript leftGene = null;
                    String leftId = "";
                    String leftName = "";
                    for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, 1, ctx.getStart())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
                        final Transcript t = iter.next();
                        if (leftGene == null || leftGene.getEnd() < t.getEnd()) {
                            leftGene = t;
                            leftId = t.getGene().getId();
                            leftName = t.getGene().getGeneName();
                        }
                    }
                    Transcript rightGene = null;
                    String rightId = "";
                    String rightName = "";
                    for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getEnd(), dict.getSequence(normalizedContig).getSequenceLength())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
                        final Transcript t = iter.next();
                        if (rightGene == null || t.getStart() < rightGene.getStart()) {
                            rightGene = t;
                            rightId = t.getGene().getId();
                            rightName = t.getGene().getGeneName();
                        }
                    }
                    // intergenic
                    final Annotation annot = new Annotation(altAllele);
                    annot.seqont.add("intergenic");
                    annot.geneId = leftId + "-" + rightId;
                    annot.geneName = leftName + "-" + rightName;
                    all_annotations.add(annot);
                } else {
                    for (final Transcript transcript : transcripts) {
                        final Annotation annotation = new Annotation(altAllele, transcript);
                        all_annotations.add(annotation);
                        if (!transcript.overlaps(ctx)) {
                            if (((ctx.getEnd() < transcript.getStart() && transcript.isNegativeStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isPositiveStrand()))) {
                                if (ctx.withinDistanceOf(transcript, 500)) {
                                    annotation.seqont.add("500B_downstream_variant");
                                } else if (ctx.withinDistanceOf(transcript, 2_000)) {
                                    annotation.seqont.add("2KB_downstream_variant");
                                }
                            } else if (((ctx.getEnd() < transcript.getStart() && transcript.isPositiveStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isNegativeStrand()))) {
                                if (ctx.withinDistanceOf(transcript, 2_000)) {
                                    annotation.seqont.add("2KB_upstream_variant");
                                } else if (ctx.withinDistanceOf(transcript, 5_000)) {
                                    annotation.seqont.add("5KB_upstream_variant");
                                }
                            }
                            continue;
                        }
                        if (CoordMath.encloses(ctx.getStart(), ctx.getEnd(), transcript.getStart(), transcript.getEnd())) {
                            // TODO can be inversion ,etc...
                            annotation.seqont.add("transcript_ablation");
                            continue;
                        }
                        for (int side = 0; side < 2; ++side) {
                            final Optional<UTR> opt_utr = (side == 0 ? transcript.getTranscriptUTR5() : transcript.getTranscriptUTR3());
                            if (!opt_utr.isPresent())
                                continue;
                            final UTR utr = opt_utr.get();
                            if (CoordMath.overlaps(utr.getStart(), utr.getEnd(), ctx.getStart(), ctx.getEnd())) {
                                annotation.seqont.add(side == 0 ? "5_prime_UTR_variant" : "3_prime_UTR_variant");
                            }
                        }
                        for (int side = 0; side < 2; ++side) {
                            final Optional<? extends ExonOrIntron> opt_ex;
                            if (side == 0) {
                                opt_ex = transcript.getExons().stream().filter(E -> E.overlaps(ctx)).findFirst();
                            } else {
                                opt_ex = transcript.getIntrons().stream().filter(E -> E.overlaps(ctx)).findFirst();
                            }
                            if (!opt_ex.isPresent())
                                continue;
                            final ExonOrIntron ei = opt_ex.get();
                            if (side == 0) {
                                if (transcript.isNonCoding())
                                    annotation.seqont.add("non_coding_transcript_exon_variant");
                            } else {
                                if (transcript.isNonCoding())
                                    annotation.seqont.add("non_coding_transcript_intron_variant");
                                annotation.seqont.add("intron");
                            }
                            if (ctx.getStart() == ctx.getEnd() && ei.isSplicing(ctx.getStart())) {
                                if (ei.isSplicingAcceptor(ctx.getStart())) {
                                    // SPLICING_ACCEPTOR
                                    annotation.seqont.add("splice_acceptor");
                                } else if (ei.isSplicingDonor(ctx.getStart())) {
                                    // SPLICING_DONOR
                                    annotation.seqont.add("splice_donor");
                                } else // ??
                                {
                                    annotation.seqont.add("splicing_variant");
                                }
                            }
                        }
                        final StructuralVariantType svType = ctx.getStructuralVariantType();
                        if (svType != null) {
                            continue;
                        }
                        if (transcript.isNonCoding()) {
                            // TODO
                            annotation.seqont.add("non_coding_transcript_variant");
                            continue;
                        }
                        RNASequence cDNA = this.transcriptId2cdna.get(transcript.getId());
                        if (cDNA == null) {
                            cDNA = rnaSeqFactory.getCodingRNA(transcript);
                            this.transcriptId2cdna.put(transcript.getId(), cDNA);
                        }
                        final OptionalInt opt_pos_cdna0 = cDNA.convertGenomic0ToRnaIndex0(ctx.getStart() - 1);
                        if (!opt_pos_cdna0.isPresent())
                            continue;
                        final int pos_cdna0 = opt_pos_cdna0.getAsInt();
                        final int pos_aa = pos_cdna0 / 3;
                        final GeneticCode geneticCode = GeneticCode.getStandard();
                        if (AcidNucleics.isATGC(altAllele.getBaseString())) {
                            String bases = altAllele.getBaseString().toUpperCase();
                            if (transcript.isNegativeStrand()) {
                                bases = AcidNucleics.reverseComplement(bases);
                            }
                            final MutedSequence mutRNA = new MutedSequence(cDNA, pos_cdna0, ctx.getReference().length(), bases);
                            final PeptideSequence<CharSequence> wildProt = PeptideSequence.of(cDNA, geneticCode);
                            final PeptideSequence<CharSequence> mutProt = PeptideSequence.of(mutRNA, geneticCode);
                            final int mod = pos_cdna0 % 3;
                            annotation.wildCodon = ("" + cDNA.charAt(pos_cdna0 - mod + 0) + cDNA.charAt(pos_cdna0 - mod + 1) + cDNA.charAt(pos_cdna0 - mod + 2));
                            annotation.mutCodon = ("" + mutRNA.charAt(pos_cdna0 - mod + 0) + mutRNA.charAt(pos_cdna0 - mod + 1) + mutRNA.charAt(pos_cdna0 - mod + 2));
                            annotation.position_protein = (pos_aa + 1);
                            annotation.wildAA = String.valueOf(wildProt.charAt(pos_aa));
                            annotation.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
                            if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
                                annotation.seqont.add("stop_lost");
                            } else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
                                annotation.seqont.add("stop_gained");
                            } else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
                                annotation.seqont.add("synonymous");
                            } else {
                                annotation.seqont.add("missense_variant");
                            }
                        }
                    }
                }
            }
            final Set<String> info = new HashSet<String>(all_annotations.size());
            for (final Annotation a : all_annotations) {
                info.add(a.toString());
            }
            final VariantContextBuilder vb = new VariantContextBuilder(ctx);
            final String thetag;
            switch(this.outputSyntax) {
                case Vep:
                    thetag = "CSQ";
                    break;
                case SnpEff:
                    thetag = "ANN";
                    break;
                default:
                    thetag = TAG;
                    break;
            }
            vb.attribute(thetag, info.toArray());
            w.add(vb.make());
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(w);
        CloserUtil.close(r);
        CloserUtil.close(this.referenceGenome);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) ExonOrIntron(com.github.lindenb.jvarkit.util.bio.structure.ExonOrIntron) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) StringUtil(htsjdk.samtools.util.StringUtil) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Optional(java.util.Optional) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) PeptideSequence(com.github.lindenb.jvarkit.util.bio.structure.PeptideSequence) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) OptionalInt(java.util.OptionalInt) RNASequenceFactory(com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) DelegateCharSequence(com.github.lindenb.jvarkit.lang.DelegateCharSequence) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) WeakHashMap(java.util.WeakHashMap) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) RNASequence(com.github.lindenb.jvarkit.util.bio.structure.RNASequence) Iterator(java.util.Iterator) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) OnePassVcfLauncher(com.github.lindenb.jvarkit.jcommander.OnePassVcfLauncher) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) DelegateCharSequence(com.github.lindenb.jvarkit.lang.DelegateCharSequence) RNASequence(com.github.lindenb.jvarkit.util.bio.structure.RNASequence) OptionalInt(java.util.OptionalInt) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) RNASequenceFactory(com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ExonOrIntron(com.github.lindenb.jvarkit.util.bio.structure.ExonOrIntron) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType)

Aggregations

GeneticCode (com.github.lindenb.jvarkit.util.bio.GeneticCode)2 RNASequence (com.github.lindenb.jvarkit.util.bio.structure.RNASequence)2 RNASequenceFactory (com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory)2 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)2 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)2 HashSet (java.util.HashSet)2 Parameter (com.beust.jcommander.Parameter)1 OnePassVcfLauncher (com.github.lindenb.jvarkit.jcommander.OnePassVcfLauncher)1 DelegateCharSequence (com.github.lindenb.jvarkit.lang.DelegateCharSequence)1 SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval)1 JVarkitVersion (com.github.lindenb.jvarkit.util.JVarkitVersion)1 AcidNucleics (com.github.lindenb.jvarkit.util.bio.AcidNucleics)1 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)1 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)1 Exon (com.github.lindenb.jvarkit.util.bio.structure.Exon)1 ExonOrIntron (com.github.lindenb.jvarkit.util.bio.structure.ExonOrIntron)1 GtfReader (com.github.lindenb.jvarkit.util.bio.structure.GtfReader)1 PeptideSequence (com.github.lindenb.jvarkit.util.bio.structure.PeptideSequence)1 Transcript (com.github.lindenb.jvarkit.util.bio.structure.Transcript)1 UTR (com.github.lindenb.jvarkit.util.bio.structure.UTR)1