use of com.github.lindenb.jvarkit.go.GOOntology in project jvarkit by lindenb.
the class GoGeneReporter method doWork.
@Override
public int doWork(final List<String> args) {
try {
final String input = oneFileOrNull(args);
final List<List<String>> table = new ArrayList<>();
try (BufferedReader br = super.openBufferedReader(input)) {
String line;
while ((line = br.readLine()) != null) {
final List<String> tokens = CharSplitter.TAB.splitAsStringList(line);
if (tokens.size() < this.geneColumnName1) {
throw new JvarkitException.TokenErrors("expected " + this.geneColumnName1 + " columns", tokens);
}
table.add(tokens);
}
}
if (table.isEmpty()) {
LOG.info("No data. Bye");
return 0;
}
final Set<String> geneNames = table.stream().skip(first_line_is_header ? 1L : 0L).map(T -> T.get(geneColumnName1 - 1)).collect(Collectors.toSet());
final Map<String, Set<GOOntology.Term>> gene2go = new HashMap<>(geneNames.size());
final GOOntology mainGoTree = new GOParser().setDebug(false).parseOBO(this.goURI);
final Set<GOOntology.Term> limitToTerms;
if (StringUtils.isBlank(this.limitTermStr)) {
limitToTerms = null;
} else {
limitToTerms = Arrays.stream(this.limitTermStr.split("[ ,\t\n]+")).map(S -> {
GOOntology.Term term = mainGoTree.getTermByAccession(S);
if (term == null)
term = mainGoTree.getTermByName(S);
if (term == null)
throw new IllegalArgumentException("Cannot find GO term : " + S);
return term;
}).collect(Collectors.toSet());
}
try (GOAFileIterator goain = GOAFileIterator.newInstance(this.goaUri)) {
while (goain.hasNext()) {
final GOAFileIterator.GafRecord rec = goain.next();
if (rec.getQualifiers().contains("NOT"))
continue;
if (!geneNames.contains(rec.getObjectSymbol()))
continue;
final GOOntology.Term term = mainGoTree.getTermByAccession(rec.getGoId());
if (term == null) {
LOG.warn("Cannot find GO term " + rec.getGoId());
continue;
}
Set<GOOntology.Term> acns = gene2go.get(rec.getObjectSymbol());
if (acns == null) {
acns = new HashSet<>();
gene2go.put(rec.getObjectSymbol(), acns);
}
acns.add(term);
}
}
LOG.warn("No GO term was found associated to the following genes:" + geneNames.stream().filter(G -> !gene2go.containsKey(G)).collect(Collectors.joining(" ")));
Reporter reporter = new TextReporter(super.openPathOrStdoutAsPrintWriter(this.outputFile));
reporter.beginDoc();
for (final GOOntology.Term term : mainGoTree.getTerms()) {
Objects.requireNonNull(term);
if (limitToTerms != null && limitToTerms.stream().noneMatch(T -> term.isDescendantOf(T)))
continue;
final Set<String> displayGenes = gene2go.entrySet().stream().filter(KV -> KV.getValue().stream().anyMatch(TERM -> TERM.isDescendantOf(term))).map(KV -> KV.getKey()).collect(Collectors.toSet());
if (displayGenes.isEmpty())
continue;
reporter.report(term, displayGenes, table);
}
reporter.endDoc();
reporter.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
use of com.github.lindenb.jvarkit.go.GOOntology in project jvarkit by lindenb.
the class VcfBurdenGoEnrichment method doWork.
@Override
public int doWork(final List<String> args) {
if (StringUtil.isBlank(this.goURI)) {
LOG.error("Undefined GOs uri.");
return -1;
}
if (this.geneFile == null || !this.geneFile.exists()) {
LOG.error("Undefined gene file option.");
return -1;
}
try {
final GOOntology gotree = new GOParser().parseOBO(this.goURI);
List<GOOntology.Term> terms = new ArrayList<>(gotree.getTerms());
final Map<GOOntology.Term, Node> term2node = new HashMap<>();
// build the node TREE
while (!terms.isEmpty()) {
int i = 0;
while (i < terms.size()) {
final GOOntology.Term t = terms.get(i);
if (!t.hasRelations()) {
term2node.put(t, new Node(t));
terms.remove(i);
} else if (t.getRelations().stream().allMatch(L -> term2node.containsKey(L.getTo()))) {
final Node n = new Node(t);
n.parents.addAll(t.getRelations().stream().map(L -> term2node.get(L.getTo())).collect(Collectors.toSet()));
term2node.put(t, n);
terms.remove(i);
} else {
i++;
}
}
}
terms = null;
final Set<String> unknownAcn = new HashSet<>();
final Map<String, Set<Node>> gene2node = new HashMap<>();
final BufferedReader r = IOUtils.openFileForBufferedReading(this.geneFile);
String line;
while ((line = r.readLine()) != null) {
if (line.isEmpty() || line.startsWith("#"))
continue;
final int t = line.indexOf('\t');
if (t == -1) {
r.close();
LOG.error("tab missing in " + line + " of " + this.geneFile);
return -1;
}
final String gene = line.substring(0, t).trim();
if (StringUtil.isBlank(gene)) {
r.close();
LOG.error("Emtpy gene in " + line);
return -1;
}
// using getTermByName because found sysnonym in GOA
final String termAcn = line.substring(t + 1).trim();
if (unknownAcn.contains(termAcn))
continue;
final GOOntology.Term term = gotree.getTermByName(termAcn);
if (term == null && !unknownAcn.contains(termAcn)) {
unknownAcn.add(termAcn);
LOG.warning("Don't know this GO term in " + line + " of " + this.geneFile + ". Could be obsolete, synonym, go specific division. Skipping.");
continue;
}
final Node node = term2node.get(term);
if (node == null) {
r.close();
LOG.error("Don't know this node in " + line + " of " + this.geneFile);
return -1;
}
Set<Node> nodes = gene2node.get(gene);
if (nodes == null) {
nodes = new HashSet<>();
gene2node.put(gene, nodes);
}
node.numGenes++;
nodes.add(node);
}
;
// clean up
unknownAcn.clear();
r.close();
final VCFIterator iter = openVCFIterator(oneFileOrNull(args));
final VCFHeader header = iter.getHeader();
final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
final Set<Pedigree.Person> persons;
if (this.pedFile != null) {
final Pedigree pedigree = Pedigree.newParser().parse(this.pedFile);
persons = new Pedigree.CaseControlExtractor().extract(header, pedigree);
} else {
persons = new Pedigree.CaseControlExtractor().extract(header);
}
final Set<Pedigree.Person> affected = persons.stream().filter(P -> P.isAffected()).collect(Collectors.toSet());
final Set<Pedigree.Person> unaffected = persons.stream().filter(P -> P.isUnaffected()).collect(Collectors.toSet());
if (affected.isEmpty()) {
LOG.error("No Affected individual");
return -1;
}
if (unaffected.isEmpty()) {
LOG.error("No unaffected individual");
return -1;
}
final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
final Predicate<Genotype> isWildGenotype = G -> {
if (G == null)
return false;
return G.isHomRef();
};
final Predicate<Genotype> isAltGenotype = G -> {
if (G == null)
return false;
return G.isCalled() && !G.isHomRef();
};
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
while (iter.hasNext()) {
final VariantContext ctx = progress.watch(iter.next());
if (!this.variantFilter.test(ctx))
continue;
final Set<String> genes = new HashSet<>();
for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
for (final String col : lookColumns) {
final String token = pred.getByCol(col);
if (!StringUtil.isBlank(token)) {
genes.add(token);
}
}
}
for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
final String token = pred.getGeneName();
if (!StringUtil.isBlank(token)) {
genes.add(token);
}
}
if (genes.isEmpty())
continue;
final Set<Node> nodes = genes.stream().filter(G -> gene2node.containsKey(G)).flatMap(G -> gene2node.get(G).stream()).collect(Collectors.toSet());
if (nodes.isEmpty())
continue;
final long unaffected_alt = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
final long affected_alt = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
/* no informative */
if (unaffected_alt + affected_alt == 0L) {
continue;
}
final long affected_ref = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
final long unaffected_ref = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
nodes.stream().forEach(N -> N.resetVisitedFlag());
nodes.stream().forEach(N -> N.visit(unaffected_ref, unaffected_alt, affected_ref, affected_alt));
}
iter.close();
progress.finish();
LOG.info("Calculating Fisher and dumping.. please wait");
final PrintWriter pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
pw.println("#go_term\tfisher\tname\tgo_term_depth\tcount_genes_in_this_node" + "\tunaffected_ref_gt" + "\tunaffected_alt_gt" + "\taffected_ref_gt" + "\taffected_alt_gt");
term2node.values().stream().filter(N -> this.show_never_seeen_term || N.sum() > 0L).sorted((n1, n2) -> Double.compare(n1.fisher(), n2.fisher())).forEach(N -> {
pw.print(N.goTerm.getAcn());
pw.print('\t');
pw.print(N.fisher());
pw.print("\t");
pw.print(N.goTerm.getName().replaceAll("[ \',\\-]+", "_"));
pw.print("\t");
pw.print(N.goTerm.getMinDepth());
pw.print('\t');
pw.print(N.numGenes);
pw.print('\t');
pw.print(N.unaffected_ref);
pw.print('\t');
pw.print(N.unaffected_alt);
pw.print('\t');
pw.print(N.affected_ref);
pw.print('\t');
pw.print(N.affected_alt);
pw.println();
});
pw.flush();
pw.close();
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
}
}
Aggregations