Use of com.github.lindenb.jvarkit.util.go.GoTree in project jvarkit by lindenb.
Class VcfBurdenGoEnrichment, method doWork.
/**
 * Runs the GO enrichment: loads the GO tree, maps genes to GO nodes, scans a VCF and
 * accumulates case/control genotype counts on the GO nodes, then writes one row per node
 * ordered by the value returned by {@code Node.fisher()}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>check that the GO uri and the gene file are defined;</li>
 *   <li>parse the GO tree and create one {@code Node} per term, linked to the nodes of the
 *       terms its relations point to;</li>
 *   <li>read the gene file (tab-delimited: gene, then GO term; empty lines and lines
 *       starting with '#' are skipped);</li>
 *   <li>for each variant accepted by {@code variantFilter}, collect gene identifiers from
 *       the VEP and ANN predictions, count ALT / HOM_REF genotypes among affected and
 *       unaffected individuals and pass these counts to the matching nodes;</li>
 *   <li>print the table to {@code outputFile} or stdout.</li>
 * </ol>
 *
 * @param args command-line arguments; at most one VCF path (resolved by {@code oneFileOrNull})
 * @return 0 on success, -1 on any error (the error is logged)
 */
@Override
public int doWork(final List<String> args) {
    if (StringUtil.isBlank(this.readingGo.goUri)) {
        LOG.error("Undefined GOs uri.");
        return -1;
    }
    if (this.geneFile == null || !this.geneFile.exists()) {
        LOG.error("Undefined gene file option.");
        return -1;
    }
    try {
        final GoTree gotree = this.readingGo.createParser().setIgnoreDbXRef(true).parse(this.readingGo.goUri);
        final List<GoTree.Term> terms = new ArrayList<>(gotree.getTerms());
        final Map<GoTree.Term, Node> term2node = new HashMap<>();
        // build the node TREE: a term is converted to a Node only once all the terms
        // it is related to already have their own Node.
        while (!terms.isEmpty()) {
            final int remainingBeforePass = terms.size();
            int i = 0;
            while (i < terms.size()) {
                final GoTree.Term t = terms.get(i);
                if (!t.hasRelations()) {
                    term2node.put(t, new Node(t));
                    terms.remove(i);
                } else if (t.getRelations().stream().allMatch(R -> term2node.containsKey(R.getTo()))) {
                    final Node n = new Node(t);
                    n.parents.addAll(t.getRelations().stream().map(R -> term2node.get(R.getTo())).collect(Collectors.toSet()));
                    term2node.put(t, n);
                    terms.remove(i);
                } else {
                    i++;
                }
            }
            // A full pass that resolved nothing means the remaining terms can never be
            // resolved (relation to an unregistered term, or a cycle): fail instead of
            // looping forever.
            if (terms.size() == remainingBeforePass) {
                LOG.error("Cannot build the GO tree: " + remainingBeforePass + " term(s) have unresolved or cyclic relations.");
                return -1;
            }
        }

        final Set<String> unknownAcn = new HashSet<>();
        final Map<String, Set<Node>> gene2node = new HashMap<>();
        // try-with-resources: the reader is closed on every exit path, including exceptions.
        try (final BufferedReader r = IOUtils.openFileForBufferedReading(this.geneFile)) {
            String line;
            while ((line = r.readLine()) != null) {
                if (line.isEmpty() || line.startsWith("#")) {
                    continue;
                }
                final int tab = line.indexOf('\t');
                if (tab == -1) {
                    LOG.error("tab missing in " + line + " of " + this.geneFile);
                    return -1;
                }
                final String gene = line.substring(0, tab).trim();
                if (StringUtil.isBlank(gene)) {
                    LOG.error("Emtpy gene in " + line);
                    return -1;
                }
                // using getTermByName because found sysnonym in GOA
                final String termAcn = line.substring(tab + 1).trim();
                // already reported as unknown: skip silently
                if (unknownAcn.contains(termAcn)) {
                    continue;
                }
                final GoTree.Term term = gotree.getTermByName(termAcn);
                if (term == null) {
                    unknownAcn.add(termAcn);
                    LOG.warning("Don't know this GO term in " + line + " of " + this.geneFile + ". Could be obsolete, synonym, go specific division. Skipping.");
                    continue;
                }
                final Node node = term2node.get(term);
                if (node == null) {
                    LOG.error("Don't know this node in " + line + " of " + this.geneFile);
                    return -1;
                }
                // Count a gene only once per node: duplicated (gene, term) rows, or
                // several names resolving to the same term, must not inflate the count.
                if (gene2node.computeIfAbsent(gene, K -> new HashSet<>()).add(node)) {
                    node.numGenes++;
                }
            }
        }
        // clean up
        unknownAcn.clear();

        final VcfIterator iter = openVcfIterator(oneFileOrNull(args));
        try {
            final VCFHeader header = iter.getHeader();
            final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
            final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
            final Set<Pedigree.Person> persons;
            if (this.pedFile != null) {
                final Pedigree pedigree = Pedigree.newParser().parse(this.pedFile);
                persons = new Pedigree.CaseControlExtractor().extract(header, pedigree);
            } else {
                persons = new Pedigree.CaseControlExtractor().extract(header);
            }
            final Set<Pedigree.Person> affected = persons.stream().filter(P -> P.isAffected()).collect(Collectors.toSet());
            final Set<Pedigree.Person> unaffected = persons.stream().filter(P -> P.isUnaffected()).collect(Collectors.toSet());
            if (affected.isEmpty()) {
                LOG.error("No Affected individual");
                return -1;
            }
            if (unaffected.isEmpty()) {
                LOG.error("No unaffected individual");
                return -1;
            }
            // VEP columns that may carry a gene/transcript/protein identifier usable as a key in gene2node
            final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
            final Predicate<Genotype> isWildGenotype = G -> G != null && G.isHomRef();
            final Predicate<Genotype> isAltGenotype = G -> G != null && G.isCalled() && !G.isHomRef();
            final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
            while (iter.hasNext()) {
                final VariantContext ctx = progress.watch(iter.next());
                if (!this.variantFilter.test(ctx)) {
                    continue;
                }
                final Set<String> genes = new HashSet<>();
                for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                    final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
                    for (final String col : lookColumns) {
                        final String token = pred.getByCol(col);
                        if (!StringUtil.isBlank(token)) {
                            genes.add(token);
                        }
                    }
                }
                for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                    final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
                    final String token = pred.getGeneName();
                    if (!StringUtil.isBlank(token)) {
                        genes.add(token);
                    }
                }
                if (genes.isEmpty()) {
                    continue;
                }
                final Set<Node> nodes = genes.stream().filter(G -> gene2node.containsKey(G)).flatMap(G -> gene2node.get(G).stream()).collect(Collectors.toSet());
                if (nodes.isEmpty()) {
                    continue;
                }
                final long unaffected_alt = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
                final long affected_alt = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
                /* no informative */
                if (unaffected_alt + affected_alt == 0L) {
                    continue;
                }
                final long affected_ref = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
                final long unaffected_ref = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
                // reset the flags of all matched nodes first, then feed the counts
                // NOTE(review): presumably visit() also propagates to parent nodes and uses
                // the visited flag to count a variant once per node - confirm in Node.
                nodes.stream().forEach(N -> N.resetVisitedFlag());
                nodes.stream().forEach(N -> N.visit(unaffected_ref, unaffected_alt, affected_ref, affected_alt));
            }
            progress.finish();
        } finally {
            // always release the VCF input, including on early return or exception
            iter.close();
        }

        LOG.info("Calculating Fisher and dumping.. please wait");
        try (final PrintWriter pw = super.openFileOrStdoutAsPrintWriter(this.outputFile)) {
            pw.println("#go_term\tfisher\tname\tgo_term_depth\tcount_genes_in_this_node" + "\tunaffected_ref_gt" + "\tunaffected_alt_gt" + "\taffected_ref_gt" + "\taffected_alt_gt");
            term2node.values().stream().filter(N -> this.show_never_seeen_term || N.sum() > 0L).sorted((n1, n2) -> Double.compare(n1.fisher(), n2.fisher())).forEach(N -> {
                pw.print(N.goTerm.getAcn());
                pw.print('\t');
                pw.print(N.fisher());
                pw.print("\t");
                // collapse runs of spaces, quotes, commas and dashes in the term name to '_'
                pw.print(N.goTerm.getName().replaceAll("[ \',\\-]+", "_"));
                pw.print("\t");
                pw.print(N.goTerm.getMinDepth());
                pw.print('\t');
                pw.print(N.numGenes);
                pw.print('\t');
                pw.print(N.unaffected_ref);
                pw.print('\t');
                pw.print(N.unaffected_alt);
                pw.print('\t');
                pw.print(N.affected_ref);
                pw.print('\t');
                pw.print(N.affected_alt);
                pw.println();
            });
            pw.flush();
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    }
}
Aggregations