Search in sources :

Example 1 with GOOntology

use of com.github.lindenb.jvarkit.go.GOOntology in project jvarkit by lindenb.

the class GoGeneReporter method doWork.

@Override
public int doWork(final List<String> args) {
    try {
        final String input = oneFileOrNull(args);
        final List<List<String>> table = new ArrayList<>();
        try (BufferedReader br = super.openBufferedReader(input)) {
            String line;
            while ((line = br.readLine()) != null) {
                final List<String> tokens = CharSplitter.TAB.splitAsStringList(line);
                if (tokens.size() < this.geneColumnName1) {
                    throw new JvarkitException.TokenErrors("expected " + this.geneColumnName1 + " columns", tokens);
                }
                table.add(tokens);
            }
        }
        if (table.isEmpty()) {
            LOG.info("No data. Bye");
            return 0;
        }
        final Set<String> geneNames = table.stream().skip(first_line_is_header ? 1L : 0L).map(T -> T.get(geneColumnName1 - 1)).collect(Collectors.toSet());
        final Map<String, Set<GOOntology.Term>> gene2go = new HashMap<>(geneNames.size());
        final GOOntology mainGoTree = new GOParser().setDebug(false).parseOBO(this.goURI);
        final Set<GOOntology.Term> limitToTerms;
        if (StringUtils.isBlank(this.limitTermStr)) {
            limitToTerms = null;
        } else {
            limitToTerms = Arrays.stream(this.limitTermStr.split("[ ,\t\n]+")).map(S -> {
                GOOntology.Term term = mainGoTree.getTermByAccession(S);
                if (term == null)
                    term = mainGoTree.getTermByName(S);
                if (term == null)
                    throw new IllegalArgumentException("Cannot find GO term : " + S);
                return term;
            }).collect(Collectors.toSet());
        }
        try (GOAFileIterator goain = GOAFileIterator.newInstance(this.goaUri)) {
            while (goain.hasNext()) {
                final GOAFileIterator.GafRecord rec = goain.next();
                if (rec.getQualifiers().contains("NOT"))
                    continue;
                if (!geneNames.contains(rec.getObjectSymbol()))
                    continue;
                final GOOntology.Term term = mainGoTree.getTermByAccession(rec.getGoId());
                if (term == null) {
                    LOG.warn("Cannot find GO term " + rec.getGoId());
                    continue;
                }
                Set<GOOntology.Term> acns = gene2go.get(rec.getObjectSymbol());
                if (acns == null) {
                    acns = new HashSet<>();
                    gene2go.put(rec.getObjectSymbol(), acns);
                }
                acns.add(term);
            }
        }
        LOG.warn("No GO term was found associated to the following genes:" + geneNames.stream().filter(G -> !gene2go.containsKey(G)).collect(Collectors.joining(" ")));
        Reporter reporter = new TextReporter(super.openPathOrStdoutAsPrintWriter(this.outputFile));
        reporter.beginDoc();
        for (final GOOntology.Term term : mainGoTree.getTerms()) {
            Objects.requireNonNull(term);
            if (limitToTerms != null && limitToTerms.stream().noneMatch(T -> term.isDescendantOf(T)))
                continue;
            final Set<String> displayGenes = gene2go.entrySet().stream().filter(KV -> KV.getValue().stream().anyMatch(TERM -> TERM.isDescendantOf(term))).map(KV -> KV.getKey()).collect(Collectors.toSet());
            if (displayGenes.isEmpty())
                continue;
            reporter.report(term, displayGenes, table);
        }
        reporter.endDoc();
        reporter.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    }
}
Also used : Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Parameter(com.beust.jcommander.Parameter) HashMap(java.util.HashMap) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) GOAFileIterator(com.github.lindenb.jvarkit.goa.GOAFileIterator) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) Map(java.util.Map) XMLStreamException(javax.xml.stream.XMLStreamException) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) GOOntology(com.github.lindenb.jvarkit.go.GOOntology) Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) GOParser(com.github.lindenb.jvarkit.go.GOParser) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Objects(java.util.Objects) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) BufferedReader(java.io.BufferedReader) GOAFileIterator(com.github.lindenb.jvarkit.goa.GOAFileIterator) HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) GOParser(com.github.lindenb.jvarkit.go.GOParser) BufferedReader(java.io.BufferedReader) ArrayList(java.util.ArrayList) List(java.util.List) GOOntology(com.github.lindenb.jvarkit.go.GOOntology)

Example 2 with GOOntology

use of com.github.lindenb.jvarkit.go.GOOntology in project jvarkit by lindenb.

the class VcfBurdenGoEnrichment method doWork.

@Override
public int doWork(final List<String> args) {
    if (StringUtil.isBlank(this.goURI)) {
        LOG.error("Undefined GOs uri.");
        return -1;
    }
    if (this.geneFile == null || !this.geneFile.exists()) {
        LOG.error("Undefined gene file option.");
        return -1;
    }
    try {
        final GOOntology gotree = new GOParser().parseOBO(this.goURI);
        List<GOOntology.Term> terms = new ArrayList<>(gotree.getTerms());
        final Map<GOOntology.Term, Node> term2node = new HashMap<>();
        // build the node TREE
        while (!terms.isEmpty()) {
            int i = 0;
            while (i < terms.size()) {
                final GOOntology.Term t = terms.get(i);
                if (!t.hasRelations()) {
                    term2node.put(t, new Node(t));
                    terms.remove(i);
                } else if (t.getRelations().stream().allMatch(L -> term2node.containsKey(L.getTo()))) {
                    final Node n = new Node(t);
                    n.parents.addAll(t.getRelations().stream().map(L -> term2node.get(L.getTo())).collect(Collectors.toSet()));
                    term2node.put(t, n);
                    terms.remove(i);
                } else {
                    i++;
                }
            }
        }
        terms = null;
        final Set<String> unknownAcn = new HashSet<>();
        final Map<String, Set<Node>> gene2node = new HashMap<>();
        final BufferedReader r = IOUtils.openFileForBufferedReading(this.geneFile);
        String line;
        while ((line = r.readLine()) != null) {
            if (line.isEmpty() || line.startsWith("#"))
                continue;
            final int t = line.indexOf('\t');
            if (t == -1) {
                r.close();
                LOG.error("tab missing in " + line + " of " + this.geneFile);
                return -1;
            }
            final String gene = line.substring(0, t).trim();
            if (StringUtil.isBlank(gene)) {
                r.close();
                LOG.error("Emtpy gene in " + line);
                return -1;
            }
            // using getTermByName because found sysnonym in GOA
            final String termAcn = line.substring(t + 1).trim();
            if (unknownAcn.contains(termAcn))
                continue;
            final GOOntology.Term term = gotree.getTermByName(termAcn);
            if (term == null && !unknownAcn.contains(termAcn)) {
                unknownAcn.add(termAcn);
                LOG.warning("Don't know this GO term in " + line + " of " + this.geneFile + ". Could be obsolete, synonym, go specific division. Skipping.");
                continue;
            }
            final Node node = term2node.get(term);
            if (node == null) {
                r.close();
                LOG.error("Don't know this node in " + line + " of " + this.geneFile);
                return -1;
            }
            Set<Node> nodes = gene2node.get(gene);
            if (nodes == null) {
                nodes = new HashSet<>();
                gene2node.put(gene, nodes);
            }
            node.numGenes++;
            nodes.add(node);
        }
        ;
        // clean up
        unknownAcn.clear();
        r.close();
        final VCFIterator iter = openVCFIterator(oneFileOrNull(args));
        final VCFHeader header = iter.getHeader();
        final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
        final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
        final Set<Pedigree.Person> persons;
        if (this.pedFile != null) {
            final Pedigree pedigree = Pedigree.newParser().parse(this.pedFile);
            persons = new Pedigree.CaseControlExtractor().extract(header, pedigree);
        } else {
            persons = new Pedigree.CaseControlExtractor().extract(header);
        }
        final Set<Pedigree.Person> affected = persons.stream().filter(P -> P.isAffected()).collect(Collectors.toSet());
        final Set<Pedigree.Person> unaffected = persons.stream().filter(P -> P.isUnaffected()).collect(Collectors.toSet());
        if (affected.isEmpty()) {
            LOG.error("No Affected individual");
            return -1;
        }
        if (unaffected.isEmpty()) {
            LOG.error("No unaffected individual");
            return -1;
        }
        final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
        final Predicate<Genotype> isWildGenotype = G -> {
            if (G == null)
                return false;
            return G.isHomRef();
        };
        final Predicate<Genotype> isAltGenotype = G -> {
            if (G == null)
                return false;
            return G.isCalled() && !G.isHomRef();
        };
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
        while (iter.hasNext()) {
            final VariantContext ctx = progress.watch(iter.next());
            if (!this.variantFilter.test(ctx))
                continue;
            final Set<String> genes = new HashSet<>();
            for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
                for (final String col : lookColumns) {
                    final String token = pred.getByCol(col);
                    if (!StringUtil.isBlank(token)) {
                        genes.add(token);
                    }
                }
            }
            for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
                final String token = pred.getGeneName();
                if (!StringUtil.isBlank(token)) {
                    genes.add(token);
                }
            }
            if (genes.isEmpty())
                continue;
            final Set<Node> nodes = genes.stream().filter(G -> gene2node.containsKey(G)).flatMap(G -> gene2node.get(G).stream()).collect(Collectors.toSet());
            if (nodes.isEmpty())
                continue;
            final long unaffected_alt = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
            final long affected_alt = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
            /* no informative */
            if (unaffected_alt + affected_alt == 0L) {
                continue;
            }
            final long affected_ref = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
            final long unaffected_ref = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
            nodes.stream().forEach(N -> N.resetVisitedFlag());
            nodes.stream().forEach(N -> N.visit(unaffected_ref, unaffected_alt, affected_ref, affected_alt));
        }
        iter.close();
        progress.finish();
        LOG.info("Calculating Fisher and dumping.. please wait");
        final PrintWriter pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        pw.println("#go_term\tfisher\tname\tgo_term_depth\tcount_genes_in_this_node" + "\tunaffected_ref_gt" + "\tunaffected_alt_gt" + "\taffected_ref_gt" + "\taffected_alt_gt");
        term2node.values().stream().filter(N -> this.show_never_seeen_term || N.sum() > 0L).sorted((n1, n2) -> Double.compare(n1.fisher(), n2.fisher())).forEach(N -> {
            pw.print(N.goTerm.getAcn());
            pw.print('\t');
            pw.print(N.fisher());
            pw.print("\t");
            pw.print(N.goTerm.getName().replaceAll("[ \',\\-]+", "_"));
            pw.print("\t");
            pw.print(N.goTerm.getMinDepth());
            pw.print('\t');
            pw.print(N.numGenes);
            pw.print('\t');
            pw.print(N.unaffected_ref);
            pw.print('\t');
            pw.print(N.unaffected_alt);
            pw.print('\t');
            pw.print(N.affected_ref);
            pw.print('\t');
            pw.print(N.affected_alt);
            pw.println();
        });
        pw.flush();
        pw.close();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) Arrays(java.util.Arrays) JexlVariantPredicate(com.github.lindenb.jvarkit.util.vcf.JexlVariantPredicate) VCFIterator(htsjdk.variant.vcf.VCFIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BiPredicate(java.util.function.BiPredicate) StringUtil(htsjdk.samtools.util.StringUtil) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) GOOntology(com.github.lindenb.jvarkit.go.GOOntology) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory) PrintWriter(java.io.PrintWriter) JexlGenotypePredicate(com.github.lindenb.jvarkit.util.vcf.JexlGenotypePredicate) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) GOParser(com.github.lindenb.jvarkit.go.GOParser) File(java.io.File) List(java.util.List) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) VariantContext(htsjdk.variant.variantcontext.VariantContext) BufferedReader(java.io.BufferedReader) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFIterator(htsjdk.variant.vcf.VCFIterator) HashSet(java.util.HashSet) PrintWriter(java.io.PrintWriter) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) GOParser(com.github.lindenb.jvarkit.go.GOParser) Genotype(htsjdk.variant.variantcontext.Genotype) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) BufferedReader(java.io.BufferedReader) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory) GOOntology(com.github.lindenb.jvarkit.go.GOOntology)

Aggregations

Parameter (com.beust.jcommander.Parameter)2 GOOntology (com.github.lindenb.jvarkit.go.GOOntology)2 GOParser (com.github.lindenb.jvarkit.go.GOParser)2 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)2 Logger (com.github.lindenb.jvarkit.util.log.Logger)2 BufferedReader (java.io.BufferedReader)2 PrintWriter (java.io.PrintWriter)2 ArrayList (java.util.ArrayList)2 Arrays (java.util.Arrays)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Map (java.util.Map)2 Set (java.util.Set)2 Collectors (java.util.stream.Collectors)2 ParametersDelegate (com.beust.jcommander.ParametersDelegate)1 GOAFileIterator (com.github.lindenb.jvarkit.goa.GOAFileIterator)1 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)1 CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter)1 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)1