Examples with KnownGene - com.github.lindenb.jvarkit.util.ucsc.KnownGene

Example 1 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class BackLocate method loadKnownGenesFromUri.

private void loadKnownGenesFromUri(String kgURI) throws IOException {
    if (this.referenceGenome.getDictionary() == null) {
        throw new JvarkitException.FastaDictionaryMissing("No sequence dictionary in " + this.indexedRefUri);
    }
    LOG.info("loading genes");
    final Set<String> unknown = new HashSet<String>();
    BufferedReader in = IOUtils.openURIForBufferedReading(kgURI);
    String line;
    final Pattern tab = Pattern.compile("[\t]");
    while ((line = in.readLine()) != null) {
        if (line.isEmpty())
            continue;
        final String[] tokens = tab.split(line);
        final KnownGene g = new KnownGene(tokens);
        final Interval rgn = new Interval(g.getContig(), g.getTxStart() + 1, g.getTxEnd());
        if (this.referenceGenome.getDictionary().getSequence(rgn.getContig()) == null) {
            if (!unknown.contains(g.getContig())) {
                LOG.warn("The reference doesn't contain chromosome " + g.getContig());
                unknown.add(g.getContig());
            }
            continue;
        }
        this.knwonGenes.put(g.getName(), g);
    }
    in.close();
    LOG.info("genes:" + this.knwonGenes.size());
}

Also used : Pattern(java.util.regex.Pattern) BufferedReader(java.io.BufferedReader) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval)

Example 2 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class Biostar81455 method doWork.

@Override
public int doWork(List<String> args) {
    BufferedReader r = null;
    String line;
    PrintStream out = null;
    final Pattern tab = Pattern.compile("[\t]");
    try {
        if (this.kgUri == null || this.kgUri.trim().isEmpty()) {
            LOG.error("undefined kguri");
            return -1;
        }
        LOG.info("readubf " + kgUri);
        this.kgMap = KnownGene.loadUriAsIntervalTreeMap(this.kgUri, (kg) -> kg.getExonCount() != 0);
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
    }
    try {
        r = super.openBufferedReader(oneFileOrNull(args));
        out = openFileOrStdoutAsPrintStream(this.outputFile);
        while ((line = r.readLine()) != null) {
            if (line.startsWith("#")) {
                out.println(line);
                continue;
            }
            boolean found = false;
            String[] tokens = tab.split(line);
            int pos0 = Integer.parseInt(tokens[1]);
            final IntervalTree<List<KnownGene>> kgs = kgMap.debugGetTree(tokens[0]);
            if (kgs == null) {
                LOG.info("no gene found in chromosome " + tokens[0] + " (check chrom prefix?)");
            } else {
                KnownGene bestGene = null;
                for (Iterator<IntervalTree.Node<List<KnownGene>>> iter = kgs.iterator(); iter.hasNext(); ) {
                    for (KnownGene kg : iter.next().getValue()) {
                        if (bestGene == null || Math.abs(distance(pos0, kg)) < Math.abs(distance(pos0, bestGene))) {
                            bestGene = kg;
                        }
                    }
                }
                if (bestGene != null) {
                    // get all transcritpts
                    final List<KnownGene> overlapKg = new ArrayList<>();
                    for (final List<KnownGene> lkg : kgMap.getOverlapping(new Interval(bestGene.getChromosome(), bestGene.getTxStart() + 1, bestGene.getTxEnd()))) {
                        overlapKg.addAll(lkg);
                    }
                    for (final KnownGene kg : overlapKg) {
                        KnownGene.Exon bestExon = null;
                        for (KnownGene.Exon exon : kg.getExons()) {
                            if (bestExon == null || Math.abs(distance(pos0, exon)) < Math.abs(distance(pos0, bestExon))) {
                                bestExon = exon;
                            }
                        }
                        if (bestExon != null) {
                            out.println(line + "\t" + kg.getName() + "\t" + kg.getTxStart() + "\t" + kg.getTxEnd() + "\t" + kg.getStrand() + "\t" + bestExon.getName() + "\t" + bestExon.getStart() + "\t" + bestExon.getEnd() + "\t" + distance(pos0, bestExon));
                            found = true;
                        }
                    }
                }
            }
            if (!found) {
                out.println(line + "\tNULL");
            }
        }
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
        CloserUtil.close(out);
    }
}

Also used : PrintStream(java.io.PrintStream) Iterator(java.util.Iterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) IntervalTree(htsjdk.samtools.util.IntervalTree) Term(com.github.lindenb.semontology.Term) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) File(java.io.File) ArrayList(java.util.ArrayList) Interval(htsjdk.samtools.util.Interval) List(java.util.List) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) BufferedReader(java.io.BufferedReader) Pattern(java.util.regex.Pattern) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintStream(java.io.PrintStream) Pattern(java.util.regex.Pattern) ArrayList(java.util.ArrayList) BufferedReader(java.io.BufferedReader) ArrayList(java.util.ArrayList) List(java.util.List) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) Interval(htsjdk.samtools.util.Interval)

Example 3 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class VcfDoest method run.

private void run(final LineIterator lr, final PrintWriter pw) throws IOException {
    SortingCollection<TranscriptInfo> sorting = null;
    CloseableIterator<TranscriptInfo> iter2 = null;
    try {
        while (lr.hasNext()) {
            VcfIterator in = VCFUtils.createVcfIteratorFromLineIterator(lr, true);
            final VCFHeader header = in.getHeader();
            final Pedigree pedigree = Pedigree.newParser().parse(header);
            if (pedigree.isEmpty()) {
                throw new IOException("No pedigree found in header VCF header. use VcfInjectPedigree to add it");
            }
            final SortedSet<Pedigree.Person> individuals = new TreeSet<>();
            for (final Pedigree.Person individual : pedigree.getPersons()) {
                if (individual.isAffected() || individual.isUnaffected()) {
                    individuals.add(individual);
                }
            }
            boolean first = true;
            pw.println("# samples ( 0: unaffected 1:affected)");
            pw.print("population <- data.frame(family=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print("\"" + person.getFamily().getId() + "\"");
                first = false;
            }
            pw.print("),name=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print("\"" + person.getId() + "\"");
                first = false;
            }
            pw.print("),status=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print(person.isUnaffected() ? 0 : 1);
                first = false;
            }
            pw.println("))");
            sorting = SortingCollection.newInstance(TranscriptInfo.class, new TranscriptInfoCodec(), new TranscriptInfoCmp(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
            sorting.setDestructiveIteration(true);
            final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
            /**
             * loop over variants
             */
            while (in.hasNext() && !pw.checkError()) {
                final VariantContext ctx = progess.watch(in.next());
                if (ctx.isFiltered())
                    continue;
                if (ctx.getAlternateAlleles().isEmpty())
                    continue;
                final Allele altAllele = ctx.getAltAlleleWithHighestAlleleCount();
                final MafCalculator mafCalculator = new MafCalculator(altAllele, ctx.getContig());
                boolean genotyped = false;
                for (final Pedigree.Person p : pedigree.getPersons()) {
                    if (!(p.isAffected() || p.isUnaffected()))
                        continue;
                    final Genotype g = ctx.getGenotype(p.getId());
                    if (g == null)
                        throw new IOException("Strange I cannot find individual " + p + " in the pedigree. Aborting.");
                    if (g.isCalled()) {
                        mafCalculator.add(g, p.isMale());
                    }
                    if (g.isHet() || g.isHomVar()) {
                        if (!g.getAlleles().contains(altAllele))
                            continue;
                        genotyped = true;
                        break;
                    }
                }
                if (!genotyped)
                    continue;
                final Interval interval = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd());
                final List<KnownGene> genes = this.overlap(interval);
                if (genes.isEmpty())
                    continue;
                for (final KnownGene kg : genes) {
                    final TranscriptInfo trInfo = new TranscriptInfo();
                    trInfo.contig = kg.getContig();
                    trInfo.txStart = kg.getTxStart();
                    trInfo.txEnd = kg.getTxEnd();
                    trInfo.transcriptName = kg.getName();
                    trInfo.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
                    trInfo.exonCount = kg.getExonCount();
                    trInfo.transcriptLength = kg.getTranscriptLength();
                    trInfo.ctxStart = ctx.getStart();
                    trInfo.ref = ctx.getReference();
                    trInfo.alt = altAllele;
                    trInfo.maf = mafCalculator.getMaf();
                    trInfo.genotypes = new byte[individuals.size()];
                    int idx = 0;
                    for (final Pedigree.Person individual : individuals) {
                        final Genotype genotype = ctx.getGenotype(individual.getId());
                        final byte b;
                        if (genotype.isHomRef()) {
                            b = 0;
                        } else if (genotype.isHomVar() && genotype.getAlleles().contains(altAllele)) {
                            b = 2;
                        } else if (genotype.isHet() && genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
                            b = 1;
                        } else /* we treat 0/2 has hom-ref */
                        if (genotype.isHet() && !genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
                            LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
                            b = 0;
                        } else /* we treat 2/2 has hom-ref */
                        if (genotype.isHomVar() && !genotype.getAlleles().contains(altAllele)) {
                            LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
                            b = 0;
                        } else {
                            b = -9;
                        }
                        trInfo.genotypes[idx] = b;
                        ++idx;
                    }
                    KnownGene archetype = kg;
                    /* find gene archetype = longest overlapping */
                    for (final KnownGene kg2 : genes) {
                        if (kg2 == kg)
                            continue;
                        if (archetype.getStrand().equals(kg2.getStrand()) && archetype.getTranscriptLength() < kg2.getTranscriptLength()) {
                            archetype = kg2;
                        }
                    }
                    trInfo.archetypeName = archetype.getName();
                    trInfo.archetypeLength = archetype.getTranscriptLength();
                    boolean ctxWasFoundInExon = false;
                    final int ctxPos0 = ctx.getStart() - 1;
                    int indexInTranscript0 = 0;
                    for (final KnownGene.Exon exon : kg.getExons()) {
                        // variant in exon ?
                        if (!(exon.getStart() > (ctx.getEnd() - 1) || (ctx.getStart() - 1) >= exon.getEnd())) {
                            ctxWasFoundInExon = true;
                            indexInTranscript0 += (ctxPos0 - exon.getStart());
                            if (kg.isNegativeStrand()) {
                                indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
                            }
                            trInfo.indexInTranscript0 = indexInTranscript0;
                            trInfo.overlapName = exon.getName();
                            sorting.add(trInfo);
                            break;
                        } else {
                            indexInTranscript0 += (exon.getEnd() - exon.getStart());
                        }
                    }
                    if (ctxWasFoundInExon) {
                        continue;
                    }
                    indexInTranscript0 = 0;
                    // search closest intron/exon junction
                    for (int ex = 0; ex + 1 < kg.getExonCount(); ++ex) {
                        final KnownGene.Exon exon1 = kg.getExon(ex);
                        indexInTranscript0 += (exon1.getEnd() - exon1.getStart());
                        final KnownGene.Exon exon2 = kg.getExon(ex + 1);
                        if (exon1.getEnd() <= ctxPos0 && ctxPos0 < exon2.getStart()) {
                            final int dist_to_exon1 = ctxPos0 - exon1.getEnd();
                            final int dist_to_exon2 = exon2.getStart() - ctxPos0;
                            if (dist_to_exon2 < dist_to_exon1) {
                                indexInTranscript0++;
                            }
                            if (kg.isNegativeStrand()) {
                                indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
                            }
                            trInfo.indexInTranscript0 = indexInTranscript0;
                            trInfo.overlapName = exon1.getNextIntron().getName();
                            sorting.add(trInfo);
                            break;
                        }
                    }
                }
            // end loop over genes
            }
            // end while loop over variants
            progess.finish();
            sorting.doneAdding();
            LOG.info("done adding");
            iter2 = sorting.iterator();
            final EqualRangeIterator<TranscriptInfo> eqiter = new EqualRangeIterator<TranscriptInfo>(iter2, new Comparator<TranscriptInfo>() {

                @Override
                public int compare(final TranscriptInfo o1, final TranscriptInfo o2) {
                    int i = o1.contig.compareTo(o2.contig);
                    if (i != 0)
                        return i;
                    i = o1.transcriptName.compareTo(o2.transcriptName);
                    return i;
                }
            });
            while (eqiter.hasNext()) {
                final List<TranscriptInfo> list = eqiter.next();
                final TranscriptInfo front = list.get(0);
                pw.println("# BEGIN TRANSCRIPT " + front.transcriptName + " ##########################################");
                pw.println("transcript.chrom <- \"" + front.contig + "\"");
                pw.println("transcript.txStart0 <- " + front.txStart + "");
                pw.println("transcript.txEnd0 <- " + front.txEnd + "");
                pw.println("transcript.name <- \"" + front.transcriptName + "\"");
                pw.println("transcript.strand <- \"" + ((char) front.strand) + "\"");
                pw.println("transcript.length <- " + front.transcriptLength + "");
                pw.println("transcript.exonCount <- " + front.exonCount + "");
                pw.println("archetype.name <- \"" + front.archetypeName + "\"");
                pw.println("archetype.length <- " + front.archetypeLength + "");
                pw.print("variants <- data.frame(chrom=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.contig + "\"");
                    first = false;
                }
                pw.print("),chromStart=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.ctxStart);
                    first = false;
                }
                pw.print("),chromEnd=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.ctxStart + v.ref.length() - 1);
                    first = false;
                }
                pw.print("),refAllele=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.ref.getDisplayString() + "\"");
                    first = false;
                }
                pw.print("),altAllele=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.alt.getDisplayString() + "\"");
                    first = false;
                }
                pw.print("),positionInTranscript1=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.indexInTranscript0 + 1);
                    first = false;
                }
                pw.print("),maf=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.maf);
                    first = false;
                }
                pw.print("),overlapName=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.overlapName + "\"");
                    first = false;
                }
                pw.println("))");
                pw.println("# genotypes as a list. Should be a multiple of length(samples).");
                pw.println("# 0 is homref (0/0), 1 is het (0/1), 2 is homvar (1/1)");
                pw.println("# if the variant contains another ALT allele: (0/2) and (2/2) are considered 0 (homref)");
                pw.print("genotypes <- c(");
                first = true;
                for (final TranscriptInfo tr : list) {
                    for (byte g : tr.genotypes) {
                        if (!first)
                            pw.print(",");
                        first = false;
                        pw.print((int) g);
                    }
                }
                pw.println(")");
                pw.println("stopifnot(NROW(variants) * NROW(population) == length(genotypes) )");
                if (this.userDefinedFunName == null || this.userDefinedFunName.trim().isEmpty()) {
                    pw.println("## WARNING not user-defined R function was defined");
                } else {
                    pw.println("# consumme data with user-defined R function ");
                    pw.println(this.userDefinedFunName + "()");
                }
                pw.println("# END TRANSCRIPT " + front.transcriptName + " ##########################################");
            }
            // end while eqiter
            eqiter.close();
            iter2.close();
            iter2 = null;
            sorting.cleanup();
            sorting = null;
        }
    } finally {
        CloserUtil.close(iter2);
        if (sorting != null)
            sorting.cleanup();
    }
}

Also used : VariantContext(htsjdk.variant.variantcontext.VariantContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) TreeSet(java.util.TreeSet) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) IOException(java.io.IOException) Allele(htsjdk.variant.variantcontext.Allele) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) Interval(htsjdk.samtools.util.Interval)

Example 4 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class VCFPredictions method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, final VcfIterator r, VariantContextWriter w) {
    ReferenceContig genomicSequence = null;
    try {
        LOG.info("opening REF:" + this.referenceGenomeSource);
        this.referenceGenome = new ReferenceGenomeFactory().open(this.referenceGenomeSource);
        loadKnownGenesFromUri();
        final VCFHeader header = (VCFHeader) r.getHeader();
        final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(this.referenceGenome.getDictionary());
        contigNameConverter.setOnNotFound(OnNotFound.SKIP);
        final VCFHeader h2 = new VCFHeader(header);
        addMetaData(h2);
        switch(this.outputSyntax) {
            case Vep:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
                    break;
                }
            case SnpEff:
                {
                    h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
                    break;
                }
            default:
                {
                    final StringBuilder format = new StringBuilder();
                    for (FORMAT1 f : FORMAT1.values()) {
                        if (format.length() > 0)
                            format.append("|");
                        format.append(f.name());
                    }
                    h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
                    break;
                }
        }
        w.writeHeader(h2);
        final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
        final SequenceOntologyTree.Term so_intron = soTree.getTermByAcn("SO:0001627");
        final SequenceOntologyTree.Term so_exon = soTree.getTermByAcn("SO:0001791");
        final SequenceOntologyTree.Term so_splice_donor = soTree.getTermByAcn("SO:0001575");
        final SequenceOntologyTree.Term so_splice_acceptor = soTree.getTermByAcn("SO:0001574");
        final SequenceOntologyTree.Term so_5_prime_UTR_variant = soTree.getTermByAcn("SO:0001623");
        final SequenceOntologyTree.Term so_3_prime_UTR_variant = soTree.getTermByAcn("SO:0001624");
        final SequenceOntologyTree.Term so_splicing_variant = soTree.getTermByAcn("SO:0001568");
        final SequenceOntologyTree.Term so_stop_lost = soTree.getTermByAcn("SO:0001578");
        final SequenceOntologyTree.Term so_stop_gained = soTree.getTermByAcn("SO:0001587");
        final SequenceOntologyTree.Term so_coding_synonymous = soTree.getTermByAcn("SO:0001819");
        final SequenceOntologyTree.Term so_coding_non_synonymous = soTree.getTermByAcn("SO:0001583");
        final SequenceOntologyTree.Term so_intergenic = soTree.getTermByAcn("SO:0001628");
        final SequenceOntologyTree.Term so_nc_transcript_variant = soTree.getTermByAcn("SO:0001619");
        final SequenceOntologyTree.Term so_non_coding_exon_variant = soTree.getTermByAcn("SO:0001792");
        final SequenceOntologyTree.Term _2KB_upstream_variant = soTree.getTermByAcn("SO:0001636");
        final SequenceOntologyTree.Term _5KB_upstream_variant = soTree.getTermByAcn("SO:0001635");
        final SequenceOntologyTree.Term _5KB_downstream_variant = soTree.getTermByAcn("SO:0001633");
        final SequenceOntologyTree.Term _500bp_downstream_variant = soTree.getTermByAcn("SO:0001634");
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        while (r.hasNext()) {
            final VariantContext ctx = progress.watch(r.next());
            final String normalizedContig = contigNameConverter.apply(ctx.getContig());
            final List<KnownGene> genes = new ArrayList<>();
            if (!StringUtil.isBlank(normalizedContig)) {
                for (final List<KnownGene> l2 : this.knownGenes.getOverlapping(new Interval(normalizedContig, ctx.getStart(), // 1-based
                ctx.getEnd()))) {
                    genes.addAll(l2);
                }
            }
            final List<Annotation> ctx_annotations = new ArrayList<Annotation>();
            if (genes == null || genes.isEmpty()) {
                // intergenic
                Annotation a = new Annotation();
                a.seqont.add(so_intergenic);
                ctx_annotations.add(a);
            } else {
                if (genomicSequence == null || !genomicSequence.hasName(normalizedContig)) {
                    LOG.info("getting genomic Sequence for " + normalizedContig);
                    genomicSequence = this.referenceGenome.getContig(normalizedContig);
                    if (genomicSequence == null)
                        throw new JvarkitException.ContigNotFoundInDictionary(normalizedContig, this.referenceGenome.getDictionary());
                }
                for (final KnownGene gene : genes) {
                    final GeneticCode geneticCode = GeneticCode.getStandard();
                    for (final Allele alt2 : ctx.getAlternateAlleles()) {
                        if (alt2.isNoCall())
                            continue;
                        if (alt2.isSymbolic()) {
                            LOG.warn("symbolic allele are not handled... " + alt2.getDisplayString());
                            continue;
                        }
                        if (alt2.isReference())
                            continue;
                        final Annotation annotations = new Annotation();
                        annotations.kg = gene;
                        annotations.alt2 = alt2;
                        if (gene.isNonCoding()) {
                            annotations.seqont.add(so_nc_transcript_variant);
                            continue;
                        }
                        ctx_annotations.add(annotations);
                        StringBuilder wildRNA = null;
                        ProteinCharSequence wildProt = null;
                        ProteinCharSequence mutProt = null;
                        MutedSequence mutRNA = null;
                        int position_in_cds = -1;
                        final int position = ctx.getStart() - 1;
                        if (!String.valueOf(genomicSequence.charAt(position)).equalsIgnoreCase(ctx.getReference().getBaseString())) {
                            if (isSimpleBase(ctx.getReference())) {
                                LOG.warn("Warning REF!=GENOMIC SEQ!!! at " + position + "/" + ctx.getReference());
                            }
                            continue;
                        }
                        if (gene.isPositiveStrand()) {
                            if (position < gene.getTxStart() - 2000) {
                                annotations.seqont.add(_5KB_upstream_variant);
                            } else if (position < gene.getTxStart()) {
                                annotations.seqont.add(_2KB_upstream_variant);
                            } else if (position >= gene.getTxEnd() + 500) {
                                annotations.seqont.add(_5KB_downstream_variant);
                            } else if (position >= gene.getTxEnd()) {
                                annotations.seqont.add(_500bp_downstream_variant);
                            } else if (position < gene.getCdsStart()) {
                                // UTR5
                                annotations.seqont.add(so_5_prime_UTR_variant);
                            } else if (gene.getCdsEnd() <= position) {
                                annotations.seqont.add(so_3_prime_UTR_variant);
                            } else {
                                int exon_index = 0;
                                while (exon_index < gene.getExonCount()) {
                                    final KnownGene.Exon exon = gene.getExon(exon_index);
                                    for (int i = exon.getStart(); i < exon.getEnd(); ++i) {
                                        if (i == position) {
                                            annotations.exon_name = exon.getName();
                                            if (exon.isNonCoding()) {
                                                annotations.seqont.add(so_non_coding_exon_variant);
                                            }
                                        }
                                        if (i < gene.getTxStart())
                                            continue;
                                        if (i < gene.getCdsStart())
                                            continue;
                                        if (i >= gene.getCdsEnd())
                                            break;
                                        if (wildRNA == null) {
                                            wildRNA = new StringBuilder();
                                            mutRNA = new MutedSequence(wildRNA);
                                        }
                                        if (i == position) {
                                            annotations.seqont.add(so_exon);
                                            annotations.exon_name = exon.getName();
                                            position_in_cds = wildRNA.length();
                                            annotations.position_cds = position_in_cds;
                                            // in splicing ?
                                            if (exon.isSplicing(position)) {
                                                if (exon.isSplicingAcceptor(position)) {
                                                    // SPLICING_ACCEPTOR
                                                    annotations.seqont.add(so_splice_acceptor);
                                                } else if (exon.isSplicingDonor(position)) {
                                                    // SPLICING_DONOR
                                                    annotations.seqont.add(so_splice_donor);
                                                } else // ??
                                                {
                                                    annotations.seqont.add(so_splicing_variant);
                                                }
                                            }
                                        }
                                        wildRNA.append(genomicSequence.charAt(i));
                                        if (i == position && isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
                                            mutRNA.put(position_in_cds, alt2.getBaseString().charAt(0));
                                        }
                                        if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
                                            wildProt = new ProteinCharSequence(geneticCode, wildRNA);
                                            mutProt = new ProteinCharSequence(geneticCode, mutRNA);
                                        }
                                    }
                                    final KnownGene.Intron intron = exon.getNextIntron();
                                    if (intron != null && intron.contains(position)) {
                                        annotations.intron_name = intron.getName();
                                        annotations.seqont.add(so_intron);
                                        if (intron.isSplicing(position)) {
                                            if (intron.isSplicingAcceptor(position)) {
                                                annotations.seqont.add(so_splice_acceptor);
                                            } else if (intron.isSplicingDonor(position)) {
                                                annotations.seqont.add(so_splice_donor);
                                            } else // ???
                                            {
                                                annotations.seqont.add(so_splicing_variant);
                                            }
                                        }
                                    }
                                    ++exon_index;
                                }
                            }
                        } else // reverse orientation
                        {
                            if (position >= gene.getTxEnd() + 2000) {
                                annotations.seqont.add(_5KB_upstream_variant);
                            } else if (position >= gene.getTxEnd()) {
                                annotations.seqont.add(_2KB_upstream_variant);
                            } else if (position < gene.getTxStart() - 500) {
                                annotations.seqont.add(_5KB_downstream_variant);
                            } else if (position < gene.getTxStart()) {
                                annotations.seqont.add(_500bp_downstream_variant);
                            } else if (position < gene.getCdsStart()) {
                                annotations.seqont.add(so_3_prime_UTR_variant);
                            } else if (gene.getCdsEnd() <= position) {
                                annotations.seqont.add(so_5_prime_UTR_variant);
                            } else {
                                int exon_index = gene.getExonCount() - 1;
                                while (exon_index >= 0) {
                                    final KnownGene.Exon exon = gene.getExon(exon_index);
                                    for (int i = exon.getEnd() - 1; i >= exon.getStart(); --i) {
                                        if (i == position) {
                                            annotations.exon_name = exon.getName();
                                            if (exon.isNonCoding()) {
                                                annotations.seqont.add(so_non_coding_exon_variant);
                                            }
                                        }
                                        if (i >= gene.getCdsEnd())
                                            continue;
                                        if (i < gene.getCdsStart())
                                            break;
                                        if (wildRNA == null) {
                                            wildRNA = new StringBuilder();
                                            mutRNA = new MutedSequence(wildRNA);
                                        }
                                        if (i == position) {
                                            annotations.seqont.add(so_exon);
                                            position_in_cds = wildRNA.length();
                                            annotations.position_cds = position_in_cds;
                                            // in splicing ?
                                            if (exon.isSplicing(position)) {
                                                if (exon.isSplicingAcceptor(position)) {
                                                    annotations.seqont.add(so_splice_acceptor);
                                                } else if (exon.isSplicingDonor(position)) {
                                                    annotations.seqont.add(so_splice_donor);
                                                } else // ?
                                                {
                                                    annotations.seqont.add(so_splicing_variant);
                                                }
                                            }
                                            if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
                                                mutRNA.put(position_in_cds, AcidNucleics.complement(alt2.getBaseString().charAt(0)));
                                            }
                                        }
                                        wildRNA.append(AcidNucleics.complement(genomicSequence.charAt(i)));
                                        if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
                                            wildProt = new ProteinCharSequence(geneticCode, wildRNA);
                                            mutProt = new ProteinCharSequence(geneticCode, mutRNA);
                                        }
                                    }
                                    final KnownGene.Intron intron = exon.getPrevIntron();
                                    if (intron != null && intron.contains(position)) {
                                        annotations.intron_name = intron.getName();
                                        annotations.seqont.add(so_intron);
                                        if (intron.isSplicing(position)) {
                                            if (intron.isSplicingAcceptor(position)) {
                                                annotations.seqont.add(so_splice_acceptor);
                                            } else if (intron.isSplicingDonor(position)) {
                                                annotations.seqont.add(so_splice_donor);
                                            } else // ?
                                            {
                                                annotations.seqont.add(so_splicing_variant);
                                            }
                                        }
                                    }
                                    --exon_index;
                                }
                            }
                        }
                        if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference()) && wildProt != null && mutProt != null && position_in_cds >= 0) {
                            final int pos_aa = position_in_cds / 3;
                            final int mod = position_in_cds % 3;
                            annotations.wildCodon = ("" + wildRNA.charAt(position_in_cds - mod + 0) + wildRNA.charAt(position_in_cds - mod + 1) + wildRNA.charAt(position_in_cds - mod + 2));
                            annotations.mutCodon = ("" + mutRNA.charAt(position_in_cds - mod + 0) + mutRNA.charAt(position_in_cds - mod + 1) + mutRNA.charAt(position_in_cds - mod + 2));
                            annotations.position_protein = (pos_aa + 1);
                            annotations.wildAA = String.valueOf(wildProt.charAt(pos_aa));
                            annotations.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
                            annotations.seqont.remove(so_exon);
                            if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
                                annotations.seqont.add(so_stop_lost);
                            } else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
                                annotations.seqont.add(so_stop_gained);
                            } else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
                                annotations.seqont.add(so_coding_synonymous);
                            } else {
                                annotations.seqont.add(so_coding_non_synonymous);
                            }
                        }
                    }
                }
            }
            final Set<String> info = new HashSet<String>(ctx_annotations.size());
            for (final Annotation a : ctx_annotations) {
                info.add(a.toString());
            }
            final VariantContextBuilder vb = new VariantContextBuilder(ctx);
            final String thetag;
            switch(this.outputSyntax) {
                case Vep:
                    thetag = "CSQ";
                    break;
                case SnpEff:
                    thetag = "ANN";
                    break;
                default:
                    thetag = TAG;
                    break;
            }
            vb.attribute(thetag, info.toArray());
            w.add(vb.make());
        }
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(this.referenceGenome);
    }
}

Also used : ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SequenceOntologyTree(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) Interval(htsjdk.samtools.util.Interval)

Example 5 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class VCFPredictions method loadKnownGenesFromUri.

private void loadKnownGenesFromUri() throws IOException {
    BufferedReader in = null;
    try {
        if (this.referenceGenome.getDictionary() == null) {
            throw new JvarkitException.FastaDictionaryMissing(this.referenceGenomeSource);
        }
        int n_ignored = 0;
        int n_genes = 0;
        this.knownGenes = new IntervalTreeMap<>();
        LOG.info("loading genes");
        final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(this.referenceGenome.getDictionary());
        contigNameConverter.setOnNotFound(OnNotFound.SKIP);
        in = IOUtils.openURIForBufferedReading(this.kgURI);
        String line;
        final Pattern tab = Pattern.compile("[\t]");
        while ((line = in.readLine()) != null) {
            if (StringUtil.isBlank(line))
                continue;
            final String[] tokens = tab.split(line);
            final KnownGene g = new KnownGene(tokens);
            final String normalizedContig = contigNameConverter.apply(g.getContig());
            if (StringUtil.isBlank(normalizedContig)) {
                ++n_ignored;
                continue;
            }
            if (this.referenceGenome.getDictionary().getSequence(normalizedContig) == null) {
                ++n_ignored;
                continue;
            }
            // because we want to set
            final int extend_gene_search = 5000;
            // SO:5KB_upstream_variant
            final Interval interval = new Interval(normalizedContig, Math.max(1, g.getTxStart() + 1 - extend_gene_search), g.getTxEnd() + extend_gene_search);
            List<KnownGene> L = this.knownGenes.get(interval);
            if (L == null) {
                L = new ArrayList<>(2);
                this.knownGenes.put(interval, L);
            }
            ++n_genes;
            L.add(g);
        }
        in.close();
        in = null;
        LOG.info("genes:" + n_genes + " (ignored: " + n_ignored + ")");
    } finally {
        CloserUtil.close(in);
    }
}

Also used : Pattern(java.util.regex.Pattern) BufferedReader(java.io.BufferedReader) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Interval(htsjdk.samtools.util.Interval)

Aggregations

KnownGene (com.github.lindenb.jvarkit.util.ucsc.KnownGene)22 Interval (htsjdk.samtools.util.Interval)11 ArrayList (java.util.ArrayList)9 BufferedReader (java.io.BufferedReader)8 Pattern (java.util.regex.Pattern)8 IOException (java.io.IOException)7 VariantContext (htsjdk.variant.variantcontext.VariantContext)6 VCFHeader (htsjdk.variant.vcf.VCFHeader)5 File (java.io.File)5 HashSet (java.util.HashSet)5 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)4 List (java.util.List)4 Parameter (com.beust.jcommander.Parameter)3 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)3 GeneticCode (com.github.lindenb.jvarkit.util.bio.GeneticCode)3 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)3 Logger (com.github.lindenb.jvarkit.util.log.Logger)3 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)3 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)3 VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator)3