use of com.github.lindenb.jvarkit.go.GOParser in project jvarkit by lindenb.
the class GoGeneReporter method doWork.
/**
 * Loads a tab-delimited table, collects the gene names found in the 1-based
 * column {@code geneColumnName1}, associates each of those genes with the GO
 * terms listed for it in the GOA file ({@code goaUri}) and finally asks a
 * {@code TextReporter} to report, for every term of the ontology, the genes
 * carrying an annotation for which {@code isDescendantOf(term)} is true.
 *
 * <p>When {@code limitTermStr} is not blank, only the terms for which
 * {@code isDescendantOf} is true against at least one of the listed
 * accessions/names are reported.
 *
 * @param args command-line arguments; resolved to the input via {@code oneFileOrNull}
 * @return 0 on success (an empty input table is not an error), -1 on any failure
 */
@Override
public int doWork(final List<String> args) {
    try {
        final String input = oneFileOrNull(args);

        // Load the whole table in memory: it is handed to the reporter for each term.
        final List<List<String>> table = new ArrayList<>();
        try (BufferedReader br = super.openBufferedReader(input)) {
            String line;
            while ((line = br.readLine()) != null) {
                final List<String> tokens = CharSplitter.TAB.splitAsStringList(line);
                if (tokens.size() < this.geneColumnName1) {
                    throw new JvarkitException.TokenErrors("expected " + this.geneColumnName1 + " columns", tokens);
                }
                table.add(tokens);
            }
        }
        if (table.isEmpty()) {
            LOG.info("No data. Bye");
            return 0;
        }

        // Gene names of interest; the first row is skipped when flagged as a header.
        final Set<String> geneNames = table.stream()
                .skip(first_line_is_header ? 1L : 0L)
                .map(T -> T.get(geneColumnName1 - 1))
                .collect(Collectors.toSet());
        final Map<String, Set<GOOntology.Term>> gene2go = new HashMap<>(geneNames.size());
        final GOOntology mainGoTree = new GOParser().setDebug(false).parseOBO(this.goURI);

        // Optional restriction: each token is resolved by accession first, then by name.
        final Set<GOOntology.Term> limitToTerms;
        if (StringUtils.isBlank(this.limitTermStr)) {
            limitToTerms = null;
        } else {
            limitToTerms = Arrays.stream(this.limitTermStr.split("[ ,\t\n]+")).map(S -> {
                GOOntology.Term term = mainGoTree.getTermByAccession(S);
                if (term == null) {
                    term = mainGoTree.getTermByName(S);
                }
                if (term == null) {
                    throw new IllegalArgumentException("Cannot find GO term : " + S);
                }
                return term;
            }).collect(Collectors.toSet());
        }

        // Scan the GOA file and keep the annotations of the genes present in the table.
        try (GOAFileIterator goain = GOAFileIterator.newInstance(this.goaUri)) {
            while (goain.hasNext()) {
                final GOAFileIterator.GafRecord rec = goain.next();
                // records carrying the "NOT" qualifier are ignored
                if (rec.getQualifiers().contains("NOT")) {
                    continue;
                }
                if (!geneNames.contains(rec.getObjectSymbol())) {
                    continue;
                }
                final GOOntology.Term term = mainGoTree.getTermByAccession(rec.getGoId());
                if (term == null) {
                    LOG.warn("Cannot find GO term " + rec.getGoId());
                    continue;
                }
                gene2go.computeIfAbsent(rec.getObjectSymbol(), K -> new HashSet<>()).add(term);
            }
        }

        // Only warn when there really are genes without any GO annotation.
        final List<String> genesWithoutTerm = geneNames.stream()
                .filter(G -> !gene2go.containsKey(G))
                .collect(Collectors.toList());
        if (!genesWithoutTerm.isEmpty()) {
            LOG.warn("No GO term was found associated to the following genes:" + String.join(" ", genesWithoutTerm));
        }

        final Reporter reporter = new TextReporter(super.openPathOrStdoutAsPrintWriter(this.outputFile));
        try {
            reporter.beginDoc();
            for (final GOOntology.Term term : mainGoTree.getTerms()) {
                Objects.requireNonNull(term);
                if (limitToTerms != null && limitToTerms.stream().noneMatch(T -> term.isDescendantOf(T))) {
                    continue;
                }
                // genes having at least one annotation that isDescendantOf the current term
                final Set<String> displayGenes = gene2go.entrySet().stream()
                        .filter(KV -> KV.getValue().stream().anyMatch(TERM -> TERM.isDescendantOf(term)))
                        .map(KV -> KV.getKey())
                        .collect(Collectors.toSet());
                if (displayGenes.isEmpty()) {
                    continue;
                }
                reporter.report(term, displayGenes, table);
            }
            reporter.endDoc();
        } finally {
            // always release the underlying writer, even when reporting fails
            reporter.close();
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    }
}
use of com.github.lindenb.jvarkit.go.GOParser in project jvarkit by lindenb.
the class VcfBurdenGoEnrichment method doWork.
/**
 * Builds a graph of {@code Node}s from the GO ontology, attaches to each gene
 * listed in {@code geneFile} the nodes of its GO terms, scans the VCF and, for
 * every variant hitting an annotated gene, feeds the REF/ALT genotype counts of
 * affected and unaffected individuals to the matching nodes via {@code Node.visit}.
 * A tab-delimited report with one line per node (sorted on {@code Node.fisher()})
 * is finally written to {@code outputFile} or stdout.
 *
 * <p>The gene file is expected to contain {@code gene<TAB>term} lines; empty
 * lines and lines starting with '#' are skipped.
 *
 * @param args command-line arguments; resolved to the VCF input via {@code oneFileOrNull}
 * @return 0 on success, -1 on any error
 */
@Override
public int doWork(final List<String> args) {
    if (StringUtil.isBlank(this.goURI)) {
        LOG.error("Undefined GOs uri.");
        return -1;
    }
    if (this.geneFile == null || !this.geneFile.exists()) {
        LOG.error("Undefined gene file option.");
        return -1;
    }
    try {
        final GOOntology gotree = new GOParser().parseOBO(this.goURI);
        final List<GOOntology.Term> terms = new ArrayList<>(gotree.getTerms());
        final Map<GOOntology.Term, Node> term2node = new HashMap<>();

        // Build the node graph: a term becomes a node once all the targets of
        // its relations already have a node. Repeat until every term is placed.
        while (!terms.isEmpty()) {
            final int sizeBeforePass = terms.size();
            int i = 0;
            while (i < terms.size()) {
                final GOOntology.Term t = terms.get(i);
                if (!t.hasRelations()) {
                    term2node.put(t, new Node(t));
                    terms.remove(i);
                } else if (t.getRelations().stream().allMatch(L -> term2node.containsKey(L.getTo()))) {
                    final Node n = new Node(t);
                    n.parents.addAll(t.getRelations().stream().map(L -> term2node.get(L.getTo())).collect(Collectors.toSet()));
                    term2node.put(t, n);
                    terms.remove(i);
                } else {
                    i++;
                }
            }
            // A full pass without any progress means the remaining terms can never
            // be resolved (cycle, or relation to a term missing from the ontology):
            // bail out instead of looping forever.
            if (terms.size() == sizeBeforePass) {
                LOG.error("Cannot build the GO tree: " + terms.size() + " term(s) have cyclic or unresolved relations.");
                return -1;
            }
        }

        final Set<String> unknownAcn = new HashSet<>();
        final Map<String, Set<Node>> gene2node = new HashMap<>();
        try (BufferedReader r = IOUtils.openFileForBufferedReading(this.geneFile)) {
            String line;
            while ((line = r.readLine()) != null) {
                if (line.isEmpty() || line.startsWith("#")) {
                    continue;
                }
                final int tab = line.indexOf('\t');
                if (tab == -1) {
                    LOG.error("tab missing in " + line + " of " + this.geneFile);
                    return -1;
                }
                final String gene = line.substring(0, tab).trim();
                if (StringUtil.isBlank(gene)) {
                    LOG.error("Emtpy gene in " + line);
                    return -1;
                }
                // using getTermByName because found sysnonym in GOA
                final String termAcn = line.substring(tab + 1).trim();
                // already reported as unknown: skip silently
                if (unknownAcn.contains(termAcn)) {
                    continue;
                }
                final GOOntology.Term term = gotree.getTermByName(termAcn);
                if (term == null) {
                    unknownAcn.add(termAcn);
                    LOG.warning("Don't know this GO term in " + line + " of " + this.geneFile + ". Could be obsolete, synonym, go specific division. Skipping.");
                    continue;
                }
                final Node node = term2node.get(term);
                if (node == null) {
                    LOG.error("Don't know this node in " + line + " of " + this.geneFile);
                    return -1;
                }
                // count the gene only once per node, even when the (gene, term)
                // pair is repeated in the gene file
                if (gene2node.computeIfAbsent(gene, K -> new HashSet<>()).add(node)) {
                    node.numGenes++;
                }
            }
        }
        // clean up
        unknownAcn.clear();

        try (VCFIterator iter = openVCFIterator(oneFileOrNull(args))) {
            final VCFHeader header = iter.getHeader();
            final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
            final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
            final Set<Pedigree.Person> persons;
            if (this.pedFile != null) {
                final Pedigree pedigree = Pedigree.newParser().parse(this.pedFile);
                persons = new Pedigree.CaseControlExtractor().extract(header, pedigree);
            } else {
                persons = new Pedigree.CaseControlExtractor().extract(header);
            }
            final Set<Pedigree.Person> affected = persons.stream().filter(P -> P.isAffected()).collect(Collectors.toSet());
            final Set<Pedigree.Person> unaffected = persons.stream().filter(P -> P.isUnaffected()).collect(Collectors.toSet());
            if (affected.isEmpty()) {
                LOG.error("No Affected individual");
                return -1;
            }
            if (unaffected.isEmpty()) {
                LOG.error("No unaffected individual");
                return -1;
            }
            // VEP columns inspected to collect gene/transcript/protein identifiers
            final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
            final Predicate<Genotype> isWildGenotype = G -> G != null && G.isHomRef();
            final Predicate<Genotype> isAltGenotype = G -> G != null && G.isCalled() && !G.isHomRef();
            final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
            while (iter.hasNext()) {
                final VariantContext ctx = progress.watch(iter.next());
                if (!this.variantFilter.test(ctx)) {
                    continue;
                }
                // collect every gene identifier found in the VEP and ANN predictions
                final Set<String> genes = new HashSet<>();
                for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                    final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
                    for (final String col : lookColumns) {
                        final String token = pred.getByCol(col);
                        if (!StringUtil.isBlank(token)) {
                            genes.add(token);
                        }
                    }
                }
                for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                    final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
                    final String token = pred.getGeneName();
                    if (!StringUtil.isBlank(token)) {
                        genes.add(token);
                    }
                }
                if (genes.isEmpty()) {
                    continue;
                }
                final Set<Node> nodes = genes.stream()
                        .filter(G -> gene2node.containsKey(G))
                        .flatMap(G -> gene2node.get(G).stream())
                        .collect(Collectors.toSet());
                if (nodes.isEmpty()) {
                    continue;
                }
                final long unaffected_alt = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
                final long affected_alt = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
                /* no informative */
                if (unaffected_alt + affected_alt == 0L) {
                    continue;
                }
                final long affected_ref = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
                final long unaffected_ref = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
                // reset the flags first, then visit the nodes with the counts of this variant
                nodes.stream().forEach(N -> N.resetVisitedFlag());
                nodes.stream().forEach(N -> N.visit(unaffected_ref, unaffected_alt, affected_ref, affected_alt));
            }
            progress.finish();
        }

        LOG.info("Calculating Fisher and dumping.. please wait");
        try (PrintWriter pw = super.openFileOrStdoutAsPrintWriter(this.outputFile)) {
            pw.println("#go_term\tfisher\tname\tgo_term_depth\tcount_genes_in_this_node" + "\tunaffected_ref_gt" + "\tunaffected_alt_gt" + "\taffected_ref_gt" + "\taffected_alt_gt");
            term2node.values().stream()
                    .filter(N -> this.show_never_seeen_term || N.sum() > 0L)
                    .sorted((n1, n2) -> Double.compare(n1.fisher(), n2.fisher()))
                    .forEach(N -> {
                        pw.print(N.goTerm.getAcn());
                        pw.print('\t');
                        pw.print(N.fisher());
                        pw.print("\t");
                        pw.print(N.goTerm.getName().replaceAll("[ \',\\-]+", "_"));
                        pw.print("\t");
                        pw.print(N.goTerm.getMinDepth());
                        pw.print('\t');
                        pw.print(N.numGenes);
                        pw.print('\t');
                        pw.print(N.unaffected_ref);
                        pw.print('\t');
                        pw.print(N.unaffected_alt);
                        pw.print('\t');
                        pw.print(N.affected_ref);
                        pw.print('\t');
                        pw.print(N.affected_alt);
                        pw.println();
                    });
            pw.flush();
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    }
}
use of com.github.lindenb.jvarkit.go.GOParser in project jvarkit by lindenb.
the class GoUtils method doWork.
/**
 * Loads the GO ontology from {@code goURI}, resolves the user-selected terms
 * (from {@code userAccStrings} and, optionally, from {@code accessionFile})
 * and then runs the requested {@code action}:
 * <ul>
 * <li>{@code dump_gexf}: write the whole ontology as a GEXF graph;</li>
 * <li>{@code goa}: print the records of the GOA file whose GO id is kept;</li>
 * <li>{@code gff3}: filter a GFF3 input using the gene symbols of the kept GOA records;</li>
 * <li>{@code dump_table} (default): print a table of the kept terms.</li>
 * </ul>
 * A term is "kept" when no user term was given, or when {@code isDescendantOf}
 * is true against at least one user term; the result is negated when
 * {@code inverse} is set.
 *
 * @param args command-line arguments; must be empty for every action but {@code dump_gexf}
 * @return 0 on success, -1 on any error
 */
@Override
public int doWork(final List<String> args) {
    try {
        this.mainGoTree = new GOParser().setDebug(this.do_debug).parseOBO(this.goURI);
        final Map<GOOntology.Term, UserTerm> userTerms = new HashMap<>();
        for (final String s : this.userAccStrings) {
            if (StringUtil.isBlank(s)) {
                continue;
            }
            final GOOntology.Term t = this.mainGoTree.getTermByAccessionOrName(s);
            if (t == null) {
                LOG.error("cannot find user term \"" + s + "\"");
                return -1;
            }
            userTerms.put(t, new UserTerm(t));
        }

        // Evaluated lazily: user terms added below from the accession file are honoured too.
        final Predicate<GOOntology.Term> keepTerm = T -> {
            final boolean keep = userTerms.isEmpty()
                    || userTerms.keySet().stream().anyMatch(USERTERM -> T.isDescendantOf(USERTERM));
            return this.inverse ? !keep : keep;
        };

        if (this.accessionFile != null) {
            final ColorUtils colorUtils = new ColorUtils();
            try (BufferedReader r = IOUtils.openPathForBufferedReading(this.accessionFile)) {
                String line;
                while ((line = r.readLine()) != null) {
                    if (line.isEmpty() || line.startsWith("#")) {
                        continue;
                    }
                    // the term is the first whitespace-delimited token of the line
                    int last = 0;
                    while (last < line.length() && !Character.isWhitespace(line.charAt(last))) {
                        ++last;
                    }
                    final String s = line.substring(0, last);
                    final GOOntology.Term t = this.mainGoTree.getTermByAccessionOrName(s);
                    if (t == null) {
                        LOG.error("In " + this.accessionFile + " cannot find user term \"" + s + "\"");
                        return -1;
                    }
                    final UserTerm ut = new UserTerm(t);
                    userTerms.put(t, ut);
                    switch (this.action) {
                        case dump_gexf: {
                            // the rest of the line carries optional color=/size= modifiers
                            for (final String left : line.substring(last).trim().split("[ \t;]+")) {
                                if (left.isEmpty()) {
                                    // cont
                                } else if (left.startsWith("color=") && ut.vizColor == null) {
                                    ut.vizColor = colorUtils.parse(left.substring(6));
                                } else if (left.startsWith("size=") && ut.vizSize == null) {
                                    ut.vizSize = Double.parseDouble(left.substring(5));
                                } else {
                                    LOG.warning("Ignoring unknown modifier " + left + " in " + line);
                                }
                            }
                            break;
                        }
                        default:
                            break;
                    }
                }
            }
        }

        switch (this.action) {
            case dump_gexf: {
                final XMLOutputFactory xof = XMLOutputFactory.newFactory();
                final Function<GOOntology.Term, String> term2str = T -> T.getAcn().replaceAll("[\\:_#]+", "_");
                // Write through a byte stream with an explicit UTF-8 encoding so that the
                // bytes match the encoding declared in the XML prolog, and make sure the
                // stream is released even when writing fails.
                try (OutputStream out = super.openFileOrStdoutAsStream(this.outputFile)) {
                    final XMLStreamWriter w = xof.createXMLStreamWriter(out, "UTF-8");
                    w.writeStartDocument("UTF-8", "1.0");
                    w.writeStartElement("gexf");
                    w.writeAttribute("xmlns", GexfConstants.XMLNS);
                    w.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
                    w.writeAttribute("xmlns:viz", GexfConstants.XMLNS_VIZ);
                    w.writeAttribute("xsi:schemaLocation", GexfConstants.XSI_SCHEMA_LOCATION);
                    w.writeAttribute("version", GexfConstants.VERSION);

                    w.writeStartElement("meta");
                    w.writeStartElement("creator");
                    w.writeCharacters(getClass().getName() + " by Pierre Lindenbaum");
                    w.writeEndElement();
                    w.writeStartElement("description");
                    w.writeCharacters("Gene Ontology Tree to Gexf :" + getProgramCommandLine());
                    w.writeEndElement();
                    // meta
                    w.writeEndElement();

                    w.writeStartElement("graph");
                    w.writeAttribute("mode", "static");
                    w.writeAttribute("defaultedgetype", "directed");

                    w.writeStartElement("attributes");
                    w.writeAttribute("class", "edge");
                    w.writeAttribute("mode", "static");
                    // attributes
                    w.writeEndElement();

                    w.writeStartElement("attributes");
                    w.writeAttribute("class", "node");
                    w.writeAttribute("mode", "static");
                    w.writeEmptyElement("attribute");
                    w.writeAttribute("id", "0");
                    w.writeAttribute("title", "description");
                    w.writeAttribute("type", "string");
                    w.writeEmptyElement("attribute");
                    w.writeAttribute("id", "1");
                    w.writeAttribute("title", "accession");
                    w.writeAttribute("type", "string");
                    w.writeEmptyElement("attribute");
                    w.writeAttribute("id", "2");
                    w.writeAttribute("title", "userTerm");
                    w.writeAttribute("type", "boolean");
                    w.writeEmptyElement("attribute");
                    w.writeAttribute("id", "3");
                    w.writeAttribute("title", "parentOfUserTerm");
                    w.writeAttribute("type", "boolean");
                    w.writeEmptyElement("attribute");
                    w.writeAttribute("id", "4");
                    w.writeAttribute("title", "childOffUserTerm");
                    w.writeAttribute("type", "boolean");
                    w.writeEmptyElement("attribute");
                    w.writeAttribute("id", "5");
                    w.writeAttribute("title", "division");
                    // the value written below is the division name (or "."), not a boolean
                    w.writeAttribute("type", "string");
                    // attributes
                    w.writeEndElement();

                    w.writeStartElement("nodes");
                    w.writeAttribute("count", String.valueOf(this.mainGoTree.size()));
                    for (final GOOntology.Term term : this.mainGoTree.getTerms()) {
                        final UserTerm ut = userTerms.get(term);
                        w.writeStartElement("node");
                        w.writeAttribute("id", term2str.apply(term));
                        w.writeAttribute("label", term.getName());
                        w.writeStartElement("attvalues");
                        w.writeEmptyElement("attvalue");
                        w.writeAttribute("for", "0");
                        w.writeAttribute("value", term.getDefinition());
                        w.writeEmptyElement("attvalue");
                        w.writeAttribute("for", "1");
                        w.writeAttribute("value", term.getAcn());
                        w.writeEmptyElement("attvalue");
                        w.writeAttribute("for", "2");
                        w.writeAttribute("value", String.valueOf(ut != null));
                        w.writeEmptyElement("attvalue");
                        // is parent of any user term
                        w.writeAttribute("for", "3");
                        w.writeAttribute("value", String.valueOf(userTerms.keySet().stream().anyMatch(T -> T.isDescendantOf(term))));
                        w.writeEmptyElement("attvalue");
                        // is child of any user term
                        w.writeAttribute("for", "4");
                        w.writeAttribute("value", String.valueOf(userTerms.keySet().stream().anyMatch(T -> term.isDescendantOf(T))));
                        w.writeEmptyElement("attvalue");
                        w.writeAttribute("for", "5");
                        w.writeAttribute("value", term.getDivision() == null ? "." : term.getDivision().name());
                        // attvalues
                        w.writeEndElement();
                        double viz_size = 1.0;
                        if (ut != null) {
                            if (ut.vizSize != null) {
                                viz_size = ut.vizSize;
                            }
                            if (ut.vizColor != null) {
                                // viz:color
                                w.writeEmptyElement("viz:color");
                                w.writeAttribute("r", String.valueOf(ut.vizColor.getRed()));
                                w.writeAttribute("g", String.valueOf(ut.vizColor.getGreen()));
                                w.writeAttribute("b", String.valueOf(ut.vizColor.getBlue()));
                                w.writeAttribute("a", "1.0");
                            }
                        }
                        w.writeEmptyElement("viz:size");
                        w.writeAttribute("value", String.valueOf(viz_size));
                        // node
                        w.writeEndElement();
                    }
                    // nodes
                    w.writeEndElement();

                    w.writeStartElement("edges");
                    w.writeAttribute("count", String.valueOf(this.mainGoTree.getTerms().stream().mapToInt(N -> N.getRelations().size()).sum()));
                    for (final GOOntology.Term term : this.mainGoTree.getTerms()) {
                        for (final GOOntology.Relation rel : term.getRelations()) {
                            w.writeStartElement("edge");
                            w.writeAttribute("id", "E" + term2str.apply(term) + "_" + term2str.apply(rel.getTo()));
                            w.writeAttribute("type", "directed");
                            w.writeAttribute("source", term2str.apply(term));
                            w.writeAttribute("target", term2str.apply(rel.getTo()));
                            w.writeAttribute("label", rel.getType());
                            w.writeAttribute("weight", String.valueOf(1));
                            final Color vizColor = Color.BLACK;
                            // viz:color
                            w.writeEmptyElement("viz:color");
                            w.writeAttribute("r", String.valueOf(vizColor.getRed()));
                            w.writeAttribute("g", String.valueOf(vizColor.getGreen()));
                            w.writeAttribute("b", String.valueOf(vizColor.getBlue()));
                            w.writeAttribute("a", "1.0");
                            w.writeEndElement();
                        }
                    }
                    // edges
                    w.writeEndElement();
                    // graph
                    w.writeEndElement();
                    // gexf
                    w.writeEndElement();
                    w.writeEndDocument();
                    w.flush();
                    // releases the writer's own resources; the stream itself is closed by try-with-resources
                    w.close();
                    out.flush();
                }
                break;
            }
            case goa: {
                if (!args.isEmpty()) {
                    LOG.error("too many arguments");
                    return -1;
                }
                // the GOA records are always read from goaURI: fail early when it is missing
                if (StringUtil.isBlank(this.goaURI)) {
                    LOG.error("undefined GOA-URI");
                    return -1;
                }
                final Set<String> acns_set = this.mainGoTree.getTerms().stream().filter(keepTerm).map(T -> T.getAcn()).collect(Collectors.toSet());
                try (BufferedReader br = IOUtils.openURIForBufferedReading(this.goaURI);
                        GOAFileIterator goain = GOAFileIterator.newInstance(br);
                        PrintWriter out = super.openFileOrStdoutAsPrintWriter(this.outputFile)) {
                    while (goain.hasNext()) {
                        final GOAFileIterator.GafRecord rec = goain.next();
                        if (rec.getQualifiers().contains("NOT")) {
                            continue;
                        }
                        if (!acns_set.contains(rec.getGoId())) {
                            continue;
                        }
                        out.println(rec.toString());
                    }
                    out.flush();
                }
                break;
            }
            case gff3: {
                if (!args.isEmpty()) {
                    LOG.error("too many arguments");
                    return -1;
                }
                if (StringUtil.isBlank(this.goaURI)) {
                    LOG.error("undefined GOA-URI");
                    return -1;
                }
                // use the configured GFF path when there is one, otherwise fall back
                // to the (empty) arguments, i.e. standard input
                final String input;
                if (StringUtils.isBlank(this.gffPath)) {
                    input = oneFileOrNull(args);
                } else {
                    input = this.gffPath;
                }
                final Set<String> acns_set = this.mainGoTree.getTerms().stream().filter(keepTerm).map(T -> T.getAcn()).collect(Collectors.toSet());
                // gene symbols having at least one kept GO annotation
                final Set<String> geneNames = new HashSet<>();
                try (BufferedReader br = IOUtils.openURIForBufferedReading(this.goaURI);
                        GOAFileIterator goain = GOAFileIterator.newInstance(br)) {
                    while (goain.hasNext()) {
                        final GOAFileIterator.GafRecord rec = goain.next();
                        if (rec.getQualifiers().contains("NOT")) {
                            continue;
                        }
                        if (!acns_set.contains(rec.getGoId())) {
                            continue;
                        }
                        geneNames.add(rec.getObjectSymbol());
                    }
                }
                final Gff3Codec gff3 = new Gff3Codec(DecodeDepth.DEEP);
                try (InputStream is = (input == null ? stdin() : IOUtils.openURIForReading(input))) {
                    final AsciiLineReader asciiLineReader = AsciiLineReader.from(is);
                    final LineIterator lr = new LineIteratorImpl(asciiLineReader);
                    try (OutputStream out = super.openFileOrStdoutAsStream(this.outputFile)) {
                        final Gff3Writer gw = new Gff3Writer(out);
                        while (!gff3.isDone(lr)) {
                            dumpGff3(gw, gff3.decode(lr), geneNames);
                        }
                        out.flush();
                    }
                    gff3.close(lr);
                    asciiLineReader.close();
                }
                break;
            }
            // through
            case dump_table:
            default: {
                if (!args.isEmpty()) {
                    LOG.error("too many arguments");
                    return -1;
                }
                try (PrintWriter out = super.openFileOrStdoutAsPrintWriter(this.outputFile)) {
                    out.println("#ACN\tNAME\tDEFINITION\tDIVISION");
                    for (final GOOntology.Term t : this.mainGoTree.getTerms()) {
                        if (!keepTerm.test(t)) {
                            continue;
                        }
                        out.print(t.getAcn());
                        out.print('\t');
                        out.print(t.getName());
                        out.print('\t');
                        out.print(t.getDefinition());
                        out.print('\t');
                        out.print(t.getDivision() == null ? "." : t.getDivision().name());
                        out.println();
                    }
                    out.flush();
                }
                break;
            }
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    }
}
Aggregations