Search in sources :

Example 1 with ExonOrIntron

use of com.github.lindenb.jvarkit.util.bio.structure.ExonOrIntron in project jvarkit by lindenb.

the class VCFPredictions method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VCFIterator r, VariantContextWriter w) {
    try {
        this.referenceGenome = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidxPath);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
        final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
        loadGtf(dict);
        final VCFHeader header = r.getHeader();
        final VCFHeader h2 = new VCFHeader(header);
        JVarkitVersion.getInstance().addMetaData(this, h2);
        switch(this.outputSyntax) {
            case Vep:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
                    break;
                }
            case SnpEff:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
                    break;
                }
            default:
                {
                    final StringBuilder format = new StringBuilder();
                    for (FORMAT1 f : FORMAT1.values()) {
                        if (format.length() > 0)
                            format.append("|");
                        format.append(f.name());
                    }
                    h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
                    break;
                }
        }
        w.writeHeader(h2);
        final RNASequenceFactory rnaSeqFactory = new RNASequenceFactory();
        rnaSeqFactory.setContigToGenomicSequence(S -> getGenomicSequence(S));
        while (r.hasNext()) {
            final VariantContext ctx = r.next();
            final String normalizedContig = contigNameConverter.apply(ctx.getContig());
            if (StringUtil.isBlank(normalizedContig)) {
                w.add(ctx);
                continue;
            }
            final List<Transcript> transcripts = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
            final List<Annotation> all_annotations = new ArrayList<>();
            final List<Allele> alternateAlleles;
            if (ctx.getNAlleles() <= 1) {
                // not a variant, just REF
                alternateAlleles = Arrays.asList(Allele.NO_CALL);
            } else {
                alternateAlleles = ctx.getAlternateAlleles();
            }
            for (final Allele altAllele : alternateAlleles) {
                if (altAllele.isReference() || altAllele.equals(Allele.SPAN_DEL) || altAllele.equals(Allele.NON_REF_ALLELE))
                    continue;
                /* intergenic ====================================================== */
                if (transcripts.isEmpty()) {
                    Transcript leftGene = null;
                    String leftId = "";
                    String leftName = "";
                    for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, 1, ctx.getStart())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
                        final Transcript t = iter.next();
                        if (leftGene == null || leftGene.getEnd() < t.getEnd()) {
                            leftGene = t;
                            leftId = t.getGene().getId();
                            leftName = t.getGene().getGeneName();
                        }
                    }
                    Transcript rightGene = null;
                    String rightId = "";
                    String rightName = "";
                    for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getEnd(), dict.getSequence(normalizedContig).getSequenceLength())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
                        final Transcript t = iter.next();
                        if (rightGene == null || t.getStart() < rightGene.getStart()) {
                            rightGene = t;
                            rightId = t.getGene().getId();
                            rightName = t.getGene().getGeneName();
                        }
                    }
                    // intergenic
                    final Annotation annot = new Annotation(altAllele);
                    annot.seqont.add("intergenic");
                    annot.geneId = leftId + "-" + rightId;
                    annot.geneName = leftName + "-" + rightName;
                    all_annotations.add(annot);
                } else {
                    for (final Transcript transcript : transcripts) {
                        final Annotation annotation = new Annotation(altAllele, transcript);
                        all_annotations.add(annotation);
                        if (!transcript.overlaps(ctx)) {
                            if (((ctx.getEnd() < transcript.getStart() && transcript.isNegativeStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isPositiveStrand()))) {
                                if (ctx.withinDistanceOf(transcript, 500)) {
                                    annotation.seqont.add("500B_downstream_variant");
                                } else if (ctx.withinDistanceOf(transcript, 2_000)) {
                                    annotation.seqont.add("2KB_downstream_variant");
                                }
                            } else if (((ctx.getEnd() < transcript.getStart() && transcript.isPositiveStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isNegativeStrand()))) {
                                if (ctx.withinDistanceOf(transcript, 2_000)) {
                                    annotation.seqont.add("2KB_upstream_variant");
                                } else if (ctx.withinDistanceOf(transcript, 5_000)) {
                                    annotation.seqont.add("5KB_upstream_variant");
                                }
                            }
                            continue;
                        }
                        if (CoordMath.encloses(ctx.getStart(), ctx.getEnd(), transcript.getStart(), transcript.getEnd())) {
                            // TODO can be inversion ,etc...
                            annotation.seqont.add("transcript_ablation");
                            continue;
                        }
                        for (int side = 0; side < 2; ++side) {
                            final Optional<UTR> opt_utr = (side == 0 ? transcript.getTranscriptUTR5() : transcript.getTranscriptUTR3());
                            if (!opt_utr.isPresent())
                                continue;
                            final UTR utr = opt_utr.get();
                            if (CoordMath.overlaps(utr.getStart(), utr.getEnd(), ctx.getStart(), ctx.getEnd())) {
                                annotation.seqont.add(side == 0 ? "5_prime_UTR_variant" : "3_prime_UTR_variant");
                            }
                        }
                        for (int side = 0; side < 2; ++side) {
                            final Optional<? extends ExonOrIntron> opt_ex;
                            if (side == 0) {
                                opt_ex = transcript.getExons().stream().filter(E -> E.overlaps(ctx)).findFirst();
                            } else {
                                opt_ex = transcript.getIntrons().stream().filter(E -> E.overlaps(ctx)).findFirst();
                            }
                            if (!opt_ex.isPresent())
                                continue;
                            final ExonOrIntron ei = opt_ex.get();
                            if (side == 0) {
                                if (transcript.isNonCoding())
                                    annotation.seqont.add("non_coding_transcript_exon_variant");
                            } else {
                                if (transcript.isNonCoding())
                                    annotation.seqont.add("non_coding_transcript_intron_variant");
                                annotation.seqont.add("intron");
                            }
                            if (ctx.getStart() == ctx.getEnd() && ei.isSplicing(ctx.getStart())) {
                                if (ei.isSplicingAcceptor(ctx.getStart())) {
                                    // SPLICING_ACCEPTOR
                                    annotation.seqont.add("splice_acceptor");
                                } else if (ei.isSplicingDonor(ctx.getStart())) {
                                    // SPLICING_DONOR
                                    annotation.seqont.add("splice_donor");
                                } else // ??
                                {
                                    annotation.seqont.add("splicing_variant");
                                }
                            }
                        }
                        final StructuralVariantType svType = ctx.getStructuralVariantType();
                        if (svType != null) {
                            continue;
                        }
                        if (transcript.isNonCoding()) {
                            // TODO
                            annotation.seqont.add("non_coding_transcript_variant");
                            continue;
                        }
                        RNASequence cDNA = this.transcriptId2cdna.get(transcript.getId());
                        if (cDNA == null) {
                            cDNA = rnaSeqFactory.getCodingRNA(transcript);
                            this.transcriptId2cdna.put(transcript.getId(), cDNA);
                        }
                        final OptionalInt opt_pos_cdna0 = cDNA.convertGenomic0ToRnaIndex0(ctx.getStart() - 1);
                        if (!opt_pos_cdna0.isPresent())
                            continue;
                        final int pos_cdna0 = opt_pos_cdna0.getAsInt();
                        final int pos_aa = pos_cdna0 / 3;
                        final GeneticCode geneticCode = GeneticCode.getStandard();
                        if (AcidNucleics.isATGC(altAllele.getBaseString())) {
                            String bases = altAllele.getBaseString().toUpperCase();
                            if (transcript.isNegativeStrand()) {
                                bases = AcidNucleics.reverseComplement(bases);
                            }
                            final MutedSequence mutRNA = new MutedSequence(cDNA, pos_cdna0, ctx.getReference().length(), bases);
                            final PeptideSequence<CharSequence> wildProt = PeptideSequence.of(cDNA, geneticCode);
                            final PeptideSequence<CharSequence> mutProt = PeptideSequence.of(mutRNA, geneticCode);
                            final int mod = pos_cdna0 % 3;
                            annotation.wildCodon = ("" + cDNA.charAt(pos_cdna0 - mod + 0) + cDNA.charAt(pos_cdna0 - mod + 1) + cDNA.charAt(pos_cdna0 - mod + 2));
                            annotation.mutCodon = ("" + mutRNA.charAt(pos_cdna0 - mod + 0) + mutRNA.charAt(pos_cdna0 - mod + 1) + mutRNA.charAt(pos_cdna0 - mod + 2));
                            annotation.position_protein = (pos_aa + 1);
                            annotation.wildAA = String.valueOf(wildProt.charAt(pos_aa));
                            annotation.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
                            if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
                                annotation.seqont.add("stop_lost");
                            } else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
                                annotation.seqont.add("stop_gained");
                            } else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
                                annotation.seqont.add("synonymous");
                            } else {
                                annotation.seqont.add("missense_variant");
                            }
                        }
                    }
                }
            }
            final Set<String> info = new HashSet<String>(all_annotations.size());
            for (final Annotation a : all_annotations) {
                info.add(a.toString());
            }
            final VariantContextBuilder vb = new VariantContextBuilder(ctx);
            final String thetag;
            switch(this.outputSyntax) {
                case Vep:
                    thetag = "CSQ";
                    break;
                case SnpEff:
                    thetag = "ANN";
                    break;
                default:
                    thetag = TAG;
                    break;
            }
            vb.attribute(thetag, info.toArray());
            w.add(vb.make());
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(w);
        CloserUtil.close(r);
        CloserUtil.close(this.referenceGenome);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) ExonOrIntron(com.github.lindenb.jvarkit.util.bio.structure.ExonOrIntron) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) StringUtil(htsjdk.samtools.util.StringUtil) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Optional(java.util.Optional) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) PeptideSequence(com.github.lindenb.jvarkit.util.bio.structure.PeptideSequence) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) OptionalInt(java.util.OptionalInt) RNASequenceFactory(com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) DelegateCharSequence(com.github.lindenb.jvarkit.lang.DelegateCharSequence) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) WeakHashMap(java.util.WeakHashMap) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) RNASequence(com.github.lindenb.jvarkit.util.bio.structure.RNASequence) Iterator(java.util.Iterator) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) OnePassVcfLauncher(com.github.lindenb.jvarkit.jcommander.OnePassVcfLauncher) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) UTR(com.github.lindenb.jvarkit.util.bio.structure.UTR) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) DelegateCharSequence(com.github.lindenb.jvarkit.lang.DelegateCharSequence) RNASequence(com.github.lindenb.jvarkit.util.bio.structure.RNASequence) OptionalInt(java.util.OptionalInt) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) RNASequenceFactory(com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ExonOrIntron(com.github.lindenb.jvarkit.util.bio.structure.ExonOrIntron) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType)

Aggregations

Parameter (com.beust.jcommander.Parameter)1 OnePassVcfLauncher (com.github.lindenb.jvarkit.jcommander.OnePassVcfLauncher)1 DelegateCharSequence (com.github.lindenb.jvarkit.lang.DelegateCharSequence)1 SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval)1 JVarkitVersion (com.github.lindenb.jvarkit.util.JVarkitVersion)1 AcidNucleics (com.github.lindenb.jvarkit.util.bio.AcidNucleics)1 GeneticCode (com.github.lindenb.jvarkit.util.bio.GeneticCode)1 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)1 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)1 ExonOrIntron (com.github.lindenb.jvarkit.util.bio.structure.ExonOrIntron)1 GtfReader (com.github.lindenb.jvarkit.util.bio.structure.GtfReader)1 PeptideSequence (com.github.lindenb.jvarkit.util.bio.structure.PeptideSequence)1 RNASequence (com.github.lindenb.jvarkit.util.bio.structure.RNASequence)1 RNASequenceFactory (com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory)1 Transcript (com.github.lindenb.jvarkit.util.bio.structure.Transcript)1 UTR (com.github.lindenb.jvarkit.util.bio.structure.UTR)1 Program (com.github.lindenb.jvarkit.util.jcommander.Program)1 Logger (com.github.lindenb.jvarkit.util.log.Logger)1 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)1 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)1