Search in sources :

Example 1 with SequenceOntologyTree

use of com.github.lindenb.jvarkit.util.so.SequenceOntologyTree in project jvarkit by lindenb.

the class VCFPredictions method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, final VcfIterator r, VariantContextWriter w) {
    ReferenceContig genomicSequence = null;
    try {
        LOG.info("opening REF:" + this.referenceGenomeSource);
        this.referenceGenome = new ReferenceGenomeFactory().open(this.referenceGenomeSource);
        loadKnownGenesFromUri();
        final VCFHeader header = (VCFHeader) r.getHeader();
        final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(this.referenceGenome.getDictionary());
        contigNameConverter.setOnNotFound(OnNotFound.SKIP);
        final VCFHeader h2 = new VCFHeader(header);
        addMetaData(h2);
        switch(this.outputSyntax) {
            case Vep:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
                    break;
                }
            case SnpEff:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
                    break;
                }
            default:
                {
                    final StringBuilder format = new StringBuilder();
                    for (FORMAT1 f : FORMAT1.values()) {
                        if (format.length() > 0)
                            format.append("|");
                        format.append(f.name());
                    }
                    h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
                    break;
                }
        }
        w.writeHeader(h2);
        final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
        final SequenceOntologyTree.Term so_intron = soTree.getTermByAcn("SO:0001627");
        final SequenceOntologyTree.Term so_exon = soTree.getTermByAcn("SO:0001791");
        final SequenceOntologyTree.Term so_splice_donor = soTree.getTermByAcn("SO:0001575");
        final SequenceOntologyTree.Term so_splice_acceptor = soTree.getTermByAcn("SO:0001574");
        final SequenceOntologyTree.Term so_5_prime_UTR_variant = soTree.getTermByAcn("SO:0001623");
        final SequenceOntologyTree.Term so_3_prime_UTR_variant = soTree.getTermByAcn("SO:0001624");
        final SequenceOntologyTree.Term so_splicing_variant = soTree.getTermByAcn("SO:0001568");
        final SequenceOntologyTree.Term so_stop_lost = soTree.getTermByAcn("SO:0001578");
        final SequenceOntologyTree.Term so_stop_gained = soTree.getTermByAcn("SO:0001587");
        final SequenceOntologyTree.Term so_coding_synonymous = soTree.getTermByAcn("SO:0001819");
        final SequenceOntologyTree.Term so_coding_non_synonymous = soTree.getTermByAcn("SO:0001583");
        final SequenceOntologyTree.Term so_intergenic = soTree.getTermByAcn("SO:0001628");
        final SequenceOntologyTree.Term so_nc_transcript_variant = soTree.getTermByAcn("SO:0001619");
        final SequenceOntologyTree.Term so_non_coding_exon_variant = soTree.getTermByAcn("SO:0001792");
        final SequenceOntologyTree.Term _2KB_upstream_variant = soTree.getTermByAcn("SO:0001636");
        final SequenceOntologyTree.Term _5KB_upstream_variant = soTree.getTermByAcn("SO:0001635");
        final SequenceOntologyTree.Term _5KB_downstream_variant = soTree.getTermByAcn("SO:0001633");
        final SequenceOntologyTree.Term _500bp_downstream_variant = soTree.getTermByAcn("SO:0001634");
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        while (r.hasNext()) {
            final VariantContext ctx = progress.watch(r.next());
            final String normalizedContig = contigNameConverter.apply(ctx.getContig());
            final List<KnownGene> genes = new ArrayList<>();
            if (!StringUtil.isBlank(normalizedContig)) {
                for (final List<KnownGene> l2 : this.knownGenes.getOverlapping(new Interval(normalizedContig, ctx.getStart(), // 1-based
                ctx.getEnd()))) {
                    genes.addAll(l2);
                }
            }
            final List<Annotation> ctx_annotations = new ArrayList<Annotation>();
            if (genes == null || genes.isEmpty()) {
                // intergenic
                Annotation a = new Annotation();
                a.seqont.add(so_intergenic);
                ctx_annotations.add(a);
            } else {
                if (genomicSequence == null || !genomicSequence.hasName(normalizedContig)) {
                    LOG.info("getting genomic Sequence for " + normalizedContig);
                    genomicSequence = this.referenceGenome.getContig(normalizedContig);
                    if (genomicSequence == null)
                        throw new JvarkitException.ContigNotFoundInDictionary(normalizedContig, this.referenceGenome.getDictionary());
                }
                for (final KnownGene gene : genes) {
                    final GeneticCode geneticCode = GeneticCode.getStandard();
                    for (final Allele alt2 : ctx.getAlternateAlleles()) {
                        if (alt2.isNoCall())
                            continue;
                        if (alt2.isSymbolic()) {
                            LOG.warn("symbolic allele are not handled... " + alt2.getDisplayString());
                            continue;
                        }
                        if (alt2.isReference())
                            continue;
                        final Annotation annotations = new Annotation();
                        annotations.kg = gene;
                        annotations.alt2 = alt2;
                        if (gene.isNonCoding()) {
                            annotations.seqont.add(so_nc_transcript_variant);
                            continue;
                        }
                        ctx_annotations.add(annotations);
                        StringBuilder wildRNA = null;
                        ProteinCharSequence wildProt = null;
                        ProteinCharSequence mutProt = null;
                        MutedSequence mutRNA = null;
                        int position_in_cds = -1;
                        final int position = ctx.getStart() - 1;
                        if (!String.valueOf(genomicSequence.charAt(position)).equalsIgnoreCase(ctx.getReference().getBaseString())) {
                            if (isSimpleBase(ctx.getReference())) {
                                LOG.warn("Warning REF!=GENOMIC SEQ!!! at " + position + "/" + ctx.getReference());
                            }
                            continue;
                        }
                        if (gene.isPositiveStrand()) {
                            if (position < gene.getTxStart() - 2000) {
                                annotations.seqont.add(_5KB_upstream_variant);
                            } else if (position < gene.getTxStart()) {
                                annotations.seqont.add(_2KB_upstream_variant);
                            } else if (position >= gene.getTxEnd() + 500) {
                                annotations.seqont.add(_5KB_downstream_variant);
                            } else if (position >= gene.getTxEnd()) {
                                annotations.seqont.add(_500bp_downstream_variant);
                            } else if (position < gene.getCdsStart()) {
                                // UTR5
                                annotations.seqont.add(so_5_prime_UTR_variant);
                            } else if (gene.getCdsEnd() <= position) {
                                annotations.seqont.add(so_3_prime_UTR_variant);
                            } else {
                                int exon_index = 0;
                                while (exon_index < gene.getExonCount()) {
                                    final KnownGene.Exon exon = gene.getExon(exon_index);
                                    for (int i = exon.getStart(); i < exon.getEnd(); ++i) {
                                        if (i == position) {
                                            annotations.exon_name = exon.getName();
                                            if (exon.isNonCoding()) {
                                                annotations.seqont.add(so_non_coding_exon_variant);
                                            }
                                        }
                                        if (i < gene.getTxStart())
                                            continue;
                                        if (i < gene.getCdsStart())
                                            continue;
                                        if (i >= gene.getCdsEnd())
                                            break;
                                        if (wildRNA == null) {
                                            wildRNA = new StringBuilder();
                                            mutRNA = new MutedSequence(wildRNA);
                                        }
                                        if (i == position) {
                                            annotations.seqont.add(so_exon);
                                            annotations.exon_name = exon.getName();
                                            position_in_cds = wildRNA.length();
                                            annotations.position_cds = position_in_cds;
                                            // in splicing ?
                                            if (exon.isSplicing(position)) {
                                                if (exon.isSplicingAcceptor(position)) {
                                                    // SPLICING_ACCEPTOR
                                                    annotations.seqont.add(so_splice_acceptor);
                                                } else if (exon.isSplicingDonor(position)) {
                                                    // SPLICING_DONOR
                                                    annotations.seqont.add(so_splice_donor);
                                                } else // ??
                                                {
                                                    annotations.seqont.add(so_splicing_variant);
                                                }
                                            }
                                        }
                                        wildRNA.append(genomicSequence.charAt(i));
                                        if (i == position && isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
                                            mutRNA.put(position_in_cds, alt2.getBaseString().charAt(0));
                                        }
                                        if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
                                            wildProt = new ProteinCharSequence(geneticCode, wildRNA);
                                            mutProt = new ProteinCharSequence(geneticCode, mutRNA);
                                        }
                                    }
                                    final KnownGene.Intron intron = exon.getNextIntron();
                                    if (intron != null && intron.contains(position)) {
                                        annotations.intron_name = intron.getName();
                                        annotations.seqont.add(so_intron);
                                        if (intron.isSplicing(position)) {
                                            if (intron.isSplicingAcceptor(position)) {
                                                annotations.seqont.add(so_splice_acceptor);
                                            } else if (intron.isSplicingDonor(position)) {
                                                annotations.seqont.add(so_splice_donor);
                                            } else // ???
                                            {
                                                annotations.seqont.add(so_splicing_variant);
                                            }
                                        }
                                    }
                                    ++exon_index;
                                }
                            }
                        } else // reverse orientation
                        {
                            if (position >= gene.getTxEnd() + 2000) {
                                annotations.seqont.add(_5KB_upstream_variant);
                            } else if (position >= gene.getTxEnd()) {
                                annotations.seqont.add(_2KB_upstream_variant);
                            } else if (position < gene.getTxStart() - 500) {
                                annotations.seqont.add(_5KB_downstream_variant);
                            } else if (position < gene.getTxStart()) {
                                annotations.seqont.add(_500bp_downstream_variant);
                            } else if (position < gene.getCdsStart()) {
                                annotations.seqont.add(so_3_prime_UTR_variant);
                            } else if (gene.getCdsEnd() <= position) {
                                annotations.seqont.add(so_5_prime_UTR_variant);
                            } else {
                                int exon_index = gene.getExonCount() - 1;
                                while (exon_index >= 0) {
                                    final KnownGene.Exon exon = gene.getExon(exon_index);
                                    for (int i = exon.getEnd() - 1; i >= exon.getStart(); --i) {
                                        if (i == position) {
                                            annotations.exon_name = exon.getName();
                                            if (exon.isNonCoding()) {
                                                annotations.seqont.add(so_non_coding_exon_variant);
                                            }
                                        }
                                        if (i >= gene.getCdsEnd())
                                            continue;
                                        if (i < gene.getCdsStart())
                                            break;
                                        if (wildRNA == null) {
                                            wildRNA = new StringBuilder();
                                            mutRNA = new MutedSequence(wildRNA);
                                        }
                                        if (i == position) {
                                            annotations.seqont.add(so_exon);
                                            position_in_cds = wildRNA.length();
                                            annotations.position_cds = position_in_cds;
                                            // in splicing ?
                                            if (exon.isSplicing(position)) {
                                                if (exon.isSplicingAcceptor(position)) {
                                                    annotations.seqont.add(so_splice_acceptor);
                                                } else if (exon.isSplicingDonor(position)) {
                                                    annotations.seqont.add(so_splice_donor);
                                                } else // ?
                                                {
                                                    annotations.seqont.add(so_splicing_variant);
                                                }
                                            }
                                            if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
                                                mutRNA.put(position_in_cds, AcidNucleics.complement(alt2.getBaseString().charAt(0)));
                                            }
                                        }
                                        wildRNA.append(AcidNucleics.complement(genomicSequence.charAt(i)));
                                        if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
                                            wildProt = new ProteinCharSequence(geneticCode, wildRNA);
                                            mutProt = new ProteinCharSequence(geneticCode, mutRNA);
                                        }
                                    }
                                    final KnownGene.Intron intron = exon.getPrevIntron();
                                    if (intron != null && intron.contains(position)) {
                                        annotations.intron_name = intron.getName();
                                        annotations.seqont.add(so_intron);
                                        if (intron.isSplicing(position)) {
                                            if (intron.isSplicingAcceptor(position)) {
                                                annotations.seqont.add(so_splice_acceptor);
                                            } else if (intron.isSplicingDonor(position)) {
                                                annotations.seqont.add(so_splice_donor);
                                            } else // ?
                                            {
                                                annotations.seqont.add(so_splicing_variant);
                                            }
                                        }
                                    }
                                    --exon_index;
                                }
                            }
                        }
                        if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference()) && wildProt != null && mutProt != null && position_in_cds >= 0) {
                            final int pos_aa = position_in_cds / 3;
                            final int mod = position_in_cds % 3;
                            annotations.wildCodon = ("" + wildRNA.charAt(position_in_cds - mod + 0) + wildRNA.charAt(position_in_cds - mod + 1) + wildRNA.charAt(position_in_cds - mod + 2));
                            annotations.mutCodon = ("" + mutRNA.charAt(position_in_cds - mod + 0) + mutRNA.charAt(position_in_cds - mod + 1) + mutRNA.charAt(position_in_cds - mod + 2));
                            annotations.position_protein = (pos_aa + 1);
                            annotations.wildAA = String.valueOf(wildProt.charAt(pos_aa));
                            annotations.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
                            annotations.seqont.remove(so_exon);
                            if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
                                annotations.seqont.add(so_stop_lost);
                            } else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
                                annotations.seqont.add(so_stop_gained);
                            } else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
                                annotations.seqont.add(so_coding_synonymous);
                            } else {
                                annotations.seqont.add(so_coding_non_synonymous);
                            }
                        }
                    }
                }
            }
            final Set<String> info = new HashSet<String>(ctx_annotations.size());
            for (final Annotation a : ctx_annotations) {
                info.add(a.toString());
            }
            final VariantContextBuilder vb = new VariantContextBuilder(ctx);
            final String thetag;
            switch(this.outputSyntax) {
                case Vep:
                    thetag = "CSQ";
                    break;
                case SnpEff:
                    thetag = "ANN";
                    break;
                default:
                    thetag = TAG;
                    break;
            }
            vb.attribute(thetag, info.toArray());
            w.add(vb.make());
        }
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(this.referenceGenome);
    }
}
Also used : ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SequenceOntologyTree(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) Interval(htsjdk.samtools.util.Interval)

Example 2 with SequenceOntologyTree

use of com.github.lindenb.jvarkit.util.so.SequenceOntologyTree in project jvarkit by lindenb.

the class VcfFilterSequenceOntology method readSequenceOntologyTree.

/**
 * return SequenceOntologyTree
 *
 * @param owluri if null, use getInstance
 * @return a SequenceOntologyTree
 * @throws IOException
 */
private static SequenceOntologyTree readSequenceOntologyTree(final String owluri) throws IOException {
    if (!StringUtil.isBlank(owluri)) {
        LOG.info("loading 'SO' tree from " + owluri);
        SequenceOntologyTree sequenceOntologyTree = SequenceOntologyTree.fromUri(owluri.trim());
        LOG.info("Done loading SO.");
        return sequenceOntologyTree;
    } else {
        return SequenceOntologyTree.getInstance();
    }
}
Also used : SequenceOntologyTree(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree)

Example 3 with SequenceOntologyTree

use of com.github.lindenb.jvarkit.util.so.SequenceOntologyTree in project jvarkit by lindenb.

the class VcfFilterSequenceOntology method doWork.

@Override
public int doWork(final List<String> args) {
    try {
        if (this.showList) {
            final SequenceOntologyTree tree = readSequenceOntologyTree(this.component.owluri);
            final PrintWriter pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
            for (final SequenceOntologyTree.Term t : tree.getTerms()) {
                pw.println(t.getAcn() + "\t" + t.getLabel());
            }
            pw.flush();
            pw.close();
            return 0;
        }
        if (this.component.initialize() != 0)
            return -1;
        return doVcfToVcf(args, this.outputFile);
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(this.component);
    }
}
Also used : SequenceOntologyTree(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) IOException(java.io.IOException) PrintWriter(java.io.PrintWriter)

Example 4 with SequenceOntologyTree

use of com.github.lindenb.jvarkit.util.so.SequenceOntologyTree in project jvarkit by lindenb.

the class VcfBurden method doWork2.

private int doWork2(List<String> args) {
    ZipOutputStream zout = null;
    FileOutputStream fout = null;
    VcfIterator in = null;
    try {
        if (args.isEmpty()) {
            LOG.info("reading from stdin.");
            in = VCFUtils.createVcfIteratorStdin();
        } else if (args.size() == 1) {
            String filename = args.get(0);
            LOG.info("reading from " + filename);
            in = VCFUtils.createVcfIterator(filename);
        } else {
            LOG.error("Illegal number of arguments.");
            return -1;
        }
        if (outputFile == null) {
            LOG.error("undefined output");
            return -1;
        } else if (!outputFile.getName().endsWith(".zip")) {
            LOG.error("output " + outputFile + " should end with .zip");
            return -1;
        } else {
            fout = new FileOutputStream(outputFile);
            zout = new ZipOutputStream(fout);
        }
        final List<String> samples = in.getHeader().getSampleNamesInOrder();
        final VCFHeader header = in.getHeader();
        String prev_chrom = null;
        final VepPredictionParser vepPredParser = new VepPredictionParserFactory().header(header).get();
        final Map<GeneTranscript, List<VariantAndCsq>> gene2variants = new HashMap<>();
        final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
        final Set<SequenceOntologyTree.Term> acn = new HashSet<>();
        /* mail solena  *SO en remplacement des SO actuels (VEP HIGH + MODERATE) - pas la peine de faire retourner les analyses mais servira pour les futures analyses burden */
        String[] acn_list = new String[] { "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587", "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889", "SO:0001821", "SO:0001822", "SO:0001583", "SO:0001818" /*
					"SO:0001589", "SO:0001587", "SO:0001582", "SO:0001583",
					"SO:0001575", "SO:0001578", "SO:0001574", "SO:0001889",
					"SO:0001821", "SO:0001822", "SO:0001893"*/
        };
        if (this.highdamage) {
            acn_list = new String[] { "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587", "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889" };
        }
        for (final String acns : acn_list) {
            final SequenceOntologyTree.Term tacn = soTree.getTermByAcn(acns);
            if (tacn == null) {
                in.close();
                throw new NullPointerException("tacn == null pour " + acns);
            }
            acn.addAll(tacn.getAllDescendants());
        }
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader());
        for (; ; ) {
            VariantContext ctx1 = null;
            if (in.hasNext()) {
                ctx1 = progress.watch(in.next());
                if (ctx1.getAlternateAlleles().size() != 1) {
                    // info("count(ALT)!=1 in "+ctx1.getChr()+":"+ctx1.getStart());
                    continue;
                }
                if (ctx1.getAlternateAlleles().get(0).equals(Allele.SPAN_DEL)) {
                    continue;
                }
            }
            if (ctx1 == null || !ctx1.getContig().equals(prev_chrom)) {
                LOG.info("DUMP to zip n=" + gene2variants.size());
                final Set<String> geneNames = new HashSet<>();
                for (GeneTranscript gene_transcript : gene2variants.keySet()) {
                    geneNames.add(gene_transcript.geneName);
                    final Set<VariantAndCsq> uniq = new TreeSet<>(this.variantAndCsqComparator);
                    uniq.addAll(gene2variants.get(gene_transcript));
                    dumpVariants(zout, prev_chrom, gene_transcript.geneName + "_" + gene_transcript.transcriptName, samples, new ArrayList<VariantAndCsq>(uniq));
                    LOG.info("dumped" + gene_transcript.geneName);
                }
                LOG.info("loop over geneName");
                for (final String geneName : geneNames) {
                    final SortedSet<VariantAndCsq> lis_nm = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_all = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_refseq = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_enst = new TreeSet<>(this.variantAndCsqComparator);
                    LOG.info("loop over gene2variants");
                    for (final GeneTranscript gene_transcript : gene2variants.keySet()) {
                        if (!geneName.equals(gene_transcript.geneName))
                            continue;
                        lis_all.addAll(gene2variants.get(gene_transcript));
                        if (gene_transcript.transcriptName.startsWith("NM_")) {
                            lis_nm.addAll(gene2variants.get(gene_transcript));
                        }
                        if (!gene_transcript.transcriptName.startsWith("ENST")) {
                            lis_refseq.addAll(gene2variants.get(gene_transcript));
                        }
                        if (gene_transcript.transcriptName.startsWith("ENST")) {
                            lis_enst.addAll(gene2variants.get(gene_transcript));
                        }
                    }
                    LOG.info("dump_ALL_TRANSCRIPTS");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_TRANSCRIPTS", samples, new ArrayList<VariantAndCsq>(lis_all));
                    LOG.info("dump_ALL_NM");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_NM", samples, new ArrayList<VariantAndCsq>(lis_nm));
                    LOG.info("dump_ALL_REFSEQ");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_REFSEQ", samples, new ArrayList<VariantAndCsq>(lis_refseq));
                    LOG.info("dump_ALL_ENST");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_ENST", samples, new ArrayList<VariantAndCsq>(lis_enst));
                }
                if (ctx1 == null)
                    break;
                LOG.info("gene2variants");
                gene2variants.clear();
                LOG.info("System.gc();");
                System.gc();
                prev_chrom = ctx1.getContig();
                LOG.info("prev_chrom=" + prev_chrom);
            }
            final Set<GeneTranscript> seen_names = new HashSet<>();
            for (final VepPredictionParser.VepPrediction pred : vepPredParser.getPredictions(ctx1)) {
                String geneName = pred.getSymbol();
                if (geneName == null || geneName.trim().isEmpty())
                    continue;
                if (this._gene2seen != null) {
                    if (!this._gene2seen.containsKey(geneName))
                        continue;
                }
                final String transcriptName = pred.getFeature();
                if (transcriptName == null || transcriptName.trim().isEmpty()) {
                    LOG.info("No transcript in " + ctx1);
                    continue;
                }
                final GeneTranscript geneTranscript = new GeneTranscript(geneName, transcriptName);
                if (seen_names.contains(geneTranscript))
                    continue;
                boolean ok = false;
                for (SequenceOntologyTree.Term so : pred.getSOTerms()) {
                    if (acn.contains(so)) {
                        ok = true;
                    }
                }
                if (!printSOTerms && !ok)
                    continue;
                List<VariantAndCsq> L = gene2variants.get(geneTranscript);
                if (L == null) {
                    L = new ArrayList<>();
                    gene2variants.put(geneTranscript, L);
                }
                Float vqslod = null;
                if (this.printVQSLOD && ctx1.hasAttribute("VQSLOD")) {
                    vqslod = (float) ctx1.getAttributeAsDouble("VQSLOD", -9999999.0);
                }
                L.add(new VariantAndCsq(ctx1, pred.getSOTerms(), pred.getPositionInCDS(), vqslod));
                seen_names.add(geneTranscript);
                if (this._gene2seen != null) {
                    this._gene2seen.put(geneTranscript.geneName, Boolean.TRUE);
                }
            }
        }
        if (this._gene2seen != null) {
            final List<VariantAndCsq> emptylist = Collections.emptyList();
            for (final String gene : this._gene2seen.keySet()) {
                if (this._gene2seen.get(gene).equals(Boolean.TRUE))
                    continue;
                LOG.warning("Gene not found : " + gene);
                dumpVariants(zout, "UNDEFINED", gene + "_000000000000000.txt", samples, emptylist);
            }
        }
        progress.finish();
        zout.finish();
        fout.flush();
        zout.flush();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(in);
        CloserUtil.close(zout);
        CloserUtil.close(fout);
    }
}
Also used : HashMap(java.util.HashMap) VariantContext(htsjdk.variant.variantcontext.VariantContext) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) IOException(java.io.IOException) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) ZipOutputStream(java.util.zip.ZipOutputStream) FileOutputStream(java.io.FileOutputStream) SequenceOntologyTree(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 5 with SequenceOntologyTree

use of com.github.lindenb.jvarkit.util.so.SequenceOntologyTree in project jvarkit by lindenb.

the class TestNg01 method testVcfFilterSo.

@Test
public void testVcfFilterSo() throws IOException {
    File output = new File(TEST_RESULTS_DIR, "jeter.filrerso.vcf");
    final AnnPredictionParser parser = new AnnPredictionParserFactory().createDefaultParser();
    final SequenceOntologyTree tree = SequenceOntologyTree.getInstance();
    String acn = "SO:0001583";
    final SequenceOntologyTree.Term term = tree.getTermByAcn(acn);
    final Set<SequenceOntologyTree.Term> terms = term.getAllDescendants();
    Assert.assertNotNull(term);
    Assert.assertTrue(terms.size() > 1);
    Assert.assertTrue(terms.contains(term));
    Assert.assertEquals(0, new VcfFilterSequenceOntology().instanceMain(new String[] { "-o", output.getPath(), "-A", acn, VCF01 }));
    streamVcf(output).forEach(V -> {
        // System.err.println(V.getAttribute("ANN")+" vs "+ terms);
        Assert.assertTrue(parser.getPredictions(V).stream().flatMap(P -> P.getSOTerms().stream()).anyMatch(T -> terms.contains(T)));
    });
    Assert.assertEquals(0, new VcfFilterSequenceOntology().instanceMain(new String[] { "-o", output.getPath(), "-A", acn, "--rmatt", "--invert", VCF01 }));
    streamVcf(output).forEach(V -> {
        Assert.assertFalse(parser.getPredictions(V).stream().flatMap(P -> P.getSOTerms().stream()).anyMatch(T -> terms.contains(T)));
    });
    Assert.assertEquals(0, new VcfFilterSequenceOntology().instanceMain(new String[] { "-o", output.getPath(), "-A", acn, "--rmatt", VCF01 }));
    Assert.assertTrue(streamVcf(output).findAny().isPresent());
    Assert.assertTrue(output.delete());
}
Also used : AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) Arrays(java.util.Arrays) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VCFHeader(htsjdk.variant.vcf.VCFHeader) VcfFilterJdk(com.github.lindenb.jvarkit.tools.vcffilterjs.VcfFilterJdk) Test(org.testng.annotations.Test) VcfBurdenFisherH(com.github.lindenb.jvarkit.tools.burden.VcfBurdenFisherH) Vcf2Xml(com.github.lindenb.jvarkit.tools.vcf2xml.Vcf2Xml) VcfInjectPedigree(com.github.lindenb.jvarkit.tools.burden.VcfInjectPedigree) VcfToTable(com.github.lindenb.jvarkit.tools.misc.VcfToTable) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) VcfMultiToOneAllele(com.github.lindenb.jvarkit.tools.misc.VcfMultiToOneAllele) SAXParser(javax.xml.parsers.SAXParser) VcfNoCallToHomRef(com.github.lindenb.jvarkit.tools.misc.VcfNoCallToHomRef) VcfBurdenFisherV(com.github.lindenb.jvarkit.tools.burden.VcfBurdenFisherV) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) VcfBurdenFilterGenes(com.github.lindenb.jvarkit.tools.burden.VcfBurdenFilterGenes) Set(java.util.Set) PadEmptyFastq(com.github.lindenb.jvarkit.tools.misc.PadEmptyFastq) VcfMultiToOne(com.github.lindenb.jvarkit.tools.onesamplevcf.VcfMultiToOne) VcfOffsetsIndexFactory(com.github.lindenb.jvarkit.tools.vcflist.VcfOffsetsIndexFactory) Stream(java.util.stream.Stream) VcfBurdenFilterExac(com.github.lindenb.jvarkit.tools.burden.VcfBurdenFilterExac) VcfMoveFiltersToInfo(com.github.lindenb.jvarkit.tools.burden.VcfMoveFiltersToInfo) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Bam2Raster(com.github.lindenb.jvarkit.tools.bam2graphics.Bam2Raster) SortVcfOnInfo(com.github.lindenb.jvarkit.tools.sortvcfonref.SortVcfOnInfo) TrapIndexer(com.github.lindenb.jvarkit.tools.trap.TrapIndexer) IterableAdapter(htsjdk.samtools.util.IterableAdapter) VcfTrap(com.github.lindenb.jvarkit.tools.trap.VcfTrap) ArrayList(java.util.ArrayList) FixVcfMissingGenotypes(com.github.lindenb.jvarkit.tools.misc.FixVcfMissingGenotypes) Assert(org.testng.Assert) SequenceOntologyTree(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) VcfRebase(com.github.lindenb.jvarkit.tools.vcfrebase.VcfRebase) StreamSupport(java.util.stream.StreamSupport) Properties(java.util.Properties) Files(java.nio.file.Files) VCFBigWig(com.github.lindenb.jvarkit.tools.vcfbigwig.VCFBigWig) VcfGnomad(com.github.lindenb.jvarkit.tools.gnomad.VcfGnomad) IOException(java.io.IOException) File(java.io.File) DefaultHandler(org.xml.sax.helpers.DefaultHandler) VcfSetSequenceDictionary(com.github.lindenb.jvarkit.tools.misc.VcfSetSequenceDictionary) VcfCreateDictionary(com.github.lindenb.jvarkit.tools.misc.VcfCreateDictionary) NgsFilesSummary(com.github.lindenb.jvarkit.tools.ngsfiles.NgsFilesSummary) VcfBurdenRscriptV(com.github.lindenb.jvarkit.tools.burden.VcfBurdenRscriptV) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) BufferedReader(java.io.BufferedReader) GroupByGene(com.github.lindenb.jvarkit.tools.groupbygene.GroupByGene) VCFFamilies(com.github.lindenb.jvarkit.tools.vcftrios.VCFFamilies) ReferenceGenome(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome) Algorithms(com.github.lindenb.jvarkit.util.Algorithms) Biostar84452(com.github.lindenb.jvarkit.tools.biostar.Biostar84452) VCFTrios(com.github.lindenb.jvarkit.tools.vcftrios.VCFTrios) URL(java.net.URL) LowResBam2Raster(com.github.lindenb.jvarkit.tools.bam2graphics.LowResBam2Raster) VCFBed(com.github.lindenb.jvarkit.tools.vcfbed.VCFBed) Random(java.util.Random) VcfToSql(com.github.lindenb.jvarkit.tools.vcf2sql.VcfToSql) MiniCaller(com.github.lindenb.jvarkit.tools.calling.MiniCaller) GoUtils(com.github.lindenb.jvarkit.tools.misc.GoUtils) FindAVariation(com.github.lindenb.jvarkit.tools.misc.FindAVariation) VcfXmlAmalgamation(com.github.lindenb.jvarkit.tools.vcfamalgation.VcfXmlAmalgamation) ImageIO(javax.imageio.ImageIO) BamToSql(com.github.lindenb.jvarkit.tools.misc.BamToSql) Gff2KnownGene(com.github.lindenb.jvarkit.tools.misc.Gff2KnownGene) VCFFixIndels(com.github.lindenb.jvarkit.tools.vcffixindels.VCFFixIndels) VCFStripAnnotations(com.github.lindenb.jvarkit.tools.vcfstripannot.VCFStripAnnotations) BufferedImage(java.awt.image.BufferedImage) Predicate(java.util.function.Predicate) BeforeClass(org.testng.annotations.BeforeClass) Collectors(java.util.stream.Collectors) IlluminaReadName(com.github.lindenb.jvarkit.tools.misc.IlluminaReadName) List(java.util.List) BackLocate(com.github.lindenb.jvarkit.tools.backlocate.BackLocate) VcfToSvg(com.github.lindenb.jvarkit.tools.misc.VcfToSvg) VariantContext(htsjdk.variant.variantcontext.VariantContext) Pattern(java.util.regex.Pattern) FastaSequenceReader(com.github.lindenb.jvarkit.util.bio.fasta.FastaSequenceReader) VcfList(com.github.lindenb.jvarkit.tools.vcflist.VcfList) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VcfFilterNotInPedigree(com.github.lindenb.jvarkit.tools.burden.VcfFilterNotInPedigree) VcfFilterSequenceOntology(com.github.lindenb.jvarkit.tools.vcffilterso.VcfFilterSequenceOntology) FindAllCoverageAtPosition(com.github.lindenb.jvarkit.tools.misc.FindAllCoverageAtPosition) VcfToRdf(com.github.lindenb.jvarkit.tools.vcf2rdf.VcfToRdf) DataProvider(org.testng.annotations.DataProvider) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) SAXParserFactory(javax.xml.parsers.SAXParserFactory) VcfLoopOverGenes(com.github.lindenb.jvarkit.tools.burden.VcfLoopOverGenes) VcfToHilbert(com.github.lindenb.jvarkit.tools.hilbert.VcfToHilbert) VcfStats(com.github.lindenb.jvarkit.tools.vcfstats.VcfStats) VcfRemoveUnusedAlt(com.github.lindenb.jvarkit.tools.misc.VcfRemoveUnusedAlt) FileInputStream(java.io.FileInputStream) CaseControlCanvas(com.github.lindenb.jvarkit.tools.burden.CaseControlCanvas) VcfCompareCallers(com.github.lindenb.jvarkit.tools.vcfcmp.VcfCompareCallers) ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) SamReader(htsjdk.samtools.SamReader) FastqShuffle(com.github.lindenb.jvarkit.tools.fastq.FastqShuffle) BamTile(com.github.lindenb.jvarkit.tools.misc.BamTile) VcfBurdenMAF(com.github.lindenb.jvarkit.tools.burden.VcfBurdenMAF) FastaSequence(com.github.lindenb.jvarkit.util.bio.fasta.FastaSequence) ConcatSam(com.github.lindenb.jvarkit.tools.misc.ConcatSam) Bam2Wig(com.github.lindenb.jvarkit.tools.bam2wig.Bam2Wig) FileReader(java.io.FileReader) VcfFilterSequenceOntology(com.github.lindenb.jvarkit.tools.vcffilterso.VcfFilterSequenceOntology) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) SequenceOntologyTree(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree) File(java.io.File) Test(org.testng.annotations.Test)

Aggregations

SequenceOntologyTree (com.github.lindenb.jvarkit.util.so.SequenceOntologyTree)5 IOException (java.io.IOException)4 VariantContext (htsjdk.variant.variantcontext.VariantContext)3 VCFHeader (htsjdk.variant.vcf.VCFHeader)3 ArrayList (java.util.ArrayList)3 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)2 List (java.util.List)2 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)1 BackLocate (com.github.lindenb.jvarkit.tools.backlocate.BackLocate)1 Bam2Raster (com.github.lindenb.jvarkit.tools.bam2graphics.Bam2Raster)1 LowResBam2Raster (com.github.lindenb.jvarkit.tools.bam2graphics.LowResBam2Raster)1 Bam2Wig (com.github.lindenb.jvarkit.tools.bam2wig.Bam2Wig)1 Biostar84452 (com.github.lindenb.jvarkit.tools.biostar.Biostar84452)1 CaseControlCanvas (com.github.lindenb.jvarkit.tools.burden.CaseControlCanvas)1 VcfBurdenFilterExac (com.github.lindenb.jvarkit.tools.burden.VcfBurdenFilterExac)1 VcfBurdenFilterGenes (com.github.lindenb.jvarkit.tools.burden.VcfBurdenFilterGenes)1 VcfBurdenFisherH (com.github.lindenb.jvarkit.tools.burden.VcfBurdenFisherH)1 VcfBurdenFisherV (com.github.lindenb.jvarkit.tools.burden.VcfBurdenFisherV)1 VcfBurdenMAF (com.github.lindenb.jvarkit.tools.burden.VcfBurdenMAF)1 VcfBurdenRscriptV (com.github.lindenb.jvarkit.tools.burden.VcfBurdenRscriptV)1