Usage example of com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser in the project jvarkit by lindenb:
class VcfBurdenFilterGenes, method doVcfToVcf.
/**
 * Streams variants from {@code in} to {@code out}, keeping (or FILTER-flagging)
 * only variants whose VEP/ANN predictions name a gene present in {@code this.geneNames}.
 *
 * Two modes, selected by {@code this.filterTag}:
 * - blank: variants with no matching gene are dropped, and the VEP/ANN INFO
 *   attributes of kept variants are rewritten to contain only the matching predictions;
 * - non-blank: every variant is written, non-matching ones get the FILTER tag.
 *
 * @return RETURN_OK on success, -1 on error.
 */
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
final VCFHeader header = in.getHeader();
try {
final VCFHeader h2 = addMetaData(new VCFHeader(header));
final VCFFilterHeaderLine filterControlsHeader;
if (!StringUtil.isBlank(this.filterTag)) {
filterControlsHeader = new VCFFilterHeaderLine(this.filterTag.trim(), "Genes not in list " + this.geneFile);
h2.addMetaDataLine(filterControlsHeader);
} else {
filterControlsHeader = null;
}
// VEP columns that may carry a gene/transcript identifier.
final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary()).logger(LOG);
out.writeHeader(h2);
while (in.hasNext() && !out.checkError()) {
final VariantContext ctx = progess.watch(in.next());
boolean keep = false;
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
// Drop/rewrite mode: remove the prediction attributes now; the matching
// subset (if any) is re-added below.
if (filterControlsHeader == null) {
vcb.rmAttribute(vepParser.getTag());
vcb.rmAttribute(annParser.getTag());
}
// Keep every VEP prediction whose gene columns match the gene list.
final List<String> newVepList = new ArrayList<>();
for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
for (final String col : lookColumns) {
final String token = pred.getByCol(col);
if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
newVepList.add(predStr);
keep = true;
// This prediction matches; move on to the next prediction.
break;
}
}
}
// Keep every matching ANN prediction.
// BUGFIX: the previous code 'break'-ed out of this (outer) loop after the
// FIRST matching prediction, so every later matching ANN entry was lost
// when the attribute was rewritten. The VEP loop above correctly keeps
// them all; this loop now does the same.
final List<String> newEffList = new ArrayList<>();
for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
final String token = pred.getGeneName();
if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
newEffList.add(predStr);
keep = true;
}
}
// not just set FILTER ?
if (filterControlsHeader == null) {
if (!newVepList.isEmpty())
vcb.attribute(vepParser.getTag(), newVepList);
if (!newEffList.isEmpty())
vcb.attribute(annParser.getTag(), newEffList);
}
if (filterControlsHeader != null) {
// FILTER mode: always emit the variant, flagged or PASS.
if (!keep) {
vcb.filter(filterControlsHeader.getID());
} else if (!ctx.isFiltered()) {
vcb.passFilters();
}
out.add(vcb.make());
} else {
// Drop mode: only emit variants with at least one matching gene.
if (keep)
out.add(vcb.make());
}
}
progess.finish();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(in);
}
}
Usage example of com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser in the project jvarkit by lindenb:
class VCFComposite, method doWork.
// Reads a VCF (stdin or file) one contig at a time, groups each contig's
// variants by gene key (collected from SnpEff/ANN and VEP predictions), then
// lets the selected DiseaseModel scan each gene's variant list, writing a
// text report to 'out'. Raw VCF lines are kept as Strings so they can be
// stored in the sorting collection and re-decoded on demand.
@Override
public int doWork(final List<String> args) {
PrintWriter out = null;
try {
out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
// --listModels mode: print the available model names/descriptions and exit.
if (listModels) {
for (final Type t : Type.values()) {
out.println(t.name());
out.println("\t" + t.getDescription());
}
out.flush();
return 0;
}
// A pedigree with at least one affected AND one unaffected sample is required.
this.pedigree = Pedigree.newParser().parse(pedigreeFile);
if (this.pedigree.getAffected().isEmpty()) {
LOG.error("No Affected sample in " + this.pedigreeFile);
return -1;
}
if (this.pedigree.getUnaffected().isEmpty()) {
LOG.error("No Unaffected sample in " + this.pedigreeFile);
return -1;
}
final DiseaseModel model = this.createModel();
final String inputName = super.oneFileOrNull(args);
// NOTE(review): 'r' is never closed, not even in the finally block — resource leak.
final LineIterator r = (inputName == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(inputName));
final VCFCodec codec = new VCFCodec();
final VCFHeader header = (VCFHeader) codec.readActualHeader(r);
final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
// final VCFHeader h2=new VCFHeader(header.getMetaDataInInputOrder(),header.getSampleNamesInOrder());
// h2.addMetaDataLine(new VCFInfoHeaderLine(this.TAG,1,VCFHeaderLineType.String,"Values from bigwig file: "+BIGWIG));
// One sorting collection per contig, flushed whenever the contig changes.
// assumes the input VCF is grouped by contig — TODO confirm with caller docs.
SortingCollection<GeneAndVariant> sorting = null;
String prevContig = null;
for (; ; ) {
String line;
final VariantContext ctx;
if (r.hasNext()) {
line = r.next();
ctx = codec.decode(line);
} else {
// End of input: signalled by a null ctx below.
line = null;
ctx = null;
}
// End of input or contig change: dump the pending gene groups.
if (ctx == null || !ctx.getContig().equals(prevContig)) {
if (sorting != null) {
LOG.debug("Dump contig " + prevContig);
sorting.doneAdding();
CloseableIterator<GeneAndVariant> iter2 = sorting.iterator();
// EqualRangeIterator yields runs of records sharing the same gene key.
EqualRangeIterator<GeneAndVariant> eqiter = new EqualRangeIterator<>(iter2, (A, B) -> A.gene.compareTo(B.gene));
while (eqiter.hasNext()) {
final List<GeneAndVariant> variants = eqiter.next();
// Re-decode the stored raw lines and hand the whole gene group to the model.
model.scan(variants.get(0).gene, variants.stream().map(L -> codec.decode(L.ctxLine)).collect(Collectors.toList()), out);
}
eqiter.close();
iter2.close();
sorting.cleanup();
}
sorting = null;
if (ctx == null)
break;
prevContig = ctx.getContig();
}
// Variant-level filters applied before gene grouping.
if (!ctx.isVariant())
continue;
if (!acceptFiltered && ctx.isFiltered())
continue;
if (!acceptID && ctx.hasID())
continue;
if (!model.accept(ctx))
continue;
// Collect gene keys from both annotation styles, prefixed with the contig
// so keys cannot collide across contigs.
final Set<String> geneKeys = new HashSet<>();
for (final AnnPredictionParser.AnnPrediction pred : annParser.getPredictions(ctx)) {
geneKeys.addAll(pred.getGeneKeys().stream().map(S -> ctx.getContig() + "_" + S).collect(Collectors.toSet()));
}
for (final VepPredictionParser.VepPrediction pred : vepParser.getPredictions(ctx)) {
geneKeys.addAll(pred.getGeneKeys().stream().map(S -> ctx.getContig() + "_" + S).collect(Collectors.toSet()));
}
if (sorting == null) {
// Sort by gene then by raw line so EqualRangeIterator can group by gene.
sorting = SortingCollection.newInstance(GeneAndVariant.class, new GeneAndVariantCodec(), (A, B) -> {
int i = A.gene.compareTo(B.gene);
if (i != 0)
return i;
return A.ctxLine.compareTo(B.ctxLine);
}, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sorting.setDestructiveIteration(true);
}
// One record per (gene, variant) pair: a variant may belong to several genes.
for (final String gk : geneKeys) {
final GeneAndVariant gav = new GeneAndVariant();
gav.gene = gk;
gav.ctxLine = line;
sorting.add(gav);
}
}
out.flush();
out.close();
out = null;
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
}
}
Usage example of com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser in the project jvarkit by lindenb:
class TestNg01, method testVcfFilterSo.
/**
 * Filters a VCF on SO:0001583 (an SO accession) and verifies that:
 * 1) kept variants carry at least one prediction whose SO term is the
 *    accession or one of its descendants;
 * 2) with --invert, no kept variant matches;
 * 3) a plain --rmatt run still produces a non-empty VCF.
 */
@Test
public void testVcfFilterSo() throws IOException {
final File output = new File(TEST_RESULTS_DIR, "jeter.filrerso.vcf");
final AnnPredictionParser parser = new AnnPredictionParserFactory().createDefaultParser();
final SequenceOntologyTree tree = SequenceOntologyTree.getInstance();
final String acn = "SO:0001583";
final SequenceOntologyTree.Term term = tree.getTermByAcn(acn);
// BUGFIX: assert non-null BEFORE dereferencing. The original called
// term.getAllDescendants() first, so an unknown accession produced an
// uninformative NullPointerException instead of a clear assertion failure.
Assert.assertNotNull(term);
final Set<SequenceOntologyTree.Term> terms = term.getAllDescendants();
Assert.assertTrue(terms.size() > 1);
Assert.assertTrue(terms.contains(term));
// 1) keep matching variants.
Assert.assertEquals(0, new VcfFilterSequenceOntology().instanceMain(new String[] { "-o", output.getPath(), "-A", acn, VCF01 }));
streamVcf(output).forEach(V -> {
// System.err.println(V.getAttribute("ANN")+" vs "+ terms);
Assert.assertTrue(parser.getPredictions(V).stream().flatMap(P -> P.getSOTerms().stream()).anyMatch(T -> terms.contains(T)));
});
// 2) inverted selection with attribute rewriting: no kept variant may match.
Assert.assertEquals(0, new VcfFilterSequenceOntology().instanceMain(new String[] { "-o", output.getPath(), "-A", acn, "--rmatt", "--invert", VCF01 }));
streamVcf(output).forEach(V -> {
Assert.assertFalse(parser.getPredictions(V).stream().flatMap(P -> P.getSOTerms().stream()).anyMatch(T -> terms.contains(T)));
});
// 3) plain --rmatt run must still emit at least one variant.
Assert.assertEquals(0, new VcfFilterSequenceOntology().instanceMain(new String[] { "-o", output.getPath(), "-A", acn, "--rmatt", VCF01 }));
Assert.assertTrue(streamVcf(output).findAny().isPresent());
Assert.assertTrue(output.delete());
}
Usage example of com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser in the project jvarkit by lindenb:
class VcfStage, method buildAnnTableRow.
/**
 * Builds the JavaFX table showing SnpEff/ANN predictions, one column per
 * ANN field. Columns are only created when the parser could be initialized
 * from the VCF header; otherwise an empty table with a placeholder is returned.
 */
private TableView<AnnPredictionParser.AnnPrediction> buildAnnTableRow(final AnnPredictionParser parser) {
final TableView<AnnPredictionParser.AnnPrediction> annTable = new TableView<>();
if (parser.isValid()) {
// Same columns as before, extractors written as method references.
annTable.getColumns().add(makeColumn("SO", AnnPredictionParser.AnnPrediction::getSOTermsString));
annTable.getColumns().add(makeColumn("Allele", AnnPredictionParser.AnnPrediction::getAllele));
annTable.getColumns().add(makeColumn("Impact", AnnPredictionParser.AnnPrediction::getPutativeImpact));
annTable.getColumns().add(makeColumn("GeneName", AnnPredictionParser.AnnPrediction::getGeneName));
annTable.getColumns().add(makeColumn("GeneId", AnnPredictionParser.AnnPrediction::getGeneId));
annTable.getColumns().add(makeColumn("Feature", AnnPredictionParser.AnnPrediction::getFeatureType));
annTable.getColumns().add(makeColumn("FeatureId", AnnPredictionParser.AnnPrediction::getFeatureId));
annTable.getColumns().add(makeColumn("Biotype", AnnPredictionParser.AnnPrediction::getTranscriptBioType));
annTable.getColumns().add(makeColumn("HGVsc", AnnPredictionParser.AnnPrediction::getHGVSc));
annTable.getColumns().add(makeColumn("Rank", AnnPredictionParser.AnnPrediction::getRank));
annTable.getColumns().add(makeColumn("cDNA-pos", AnnPredictionParser.AnnPrediction::getCDNAPos));
annTable.getColumns().add(makeColumn("CDS-pos", AnnPredictionParser.AnnPrediction::getCDSPos));
annTable.getColumns().add(makeColumn("AA-pos", AnnPredictionParser.AnnPrediction::getAAPos));
annTable.getColumns().add(makeColumn("Distance", AnnPredictionParser.AnnPrediction::getDistance));
annTable.getColumns().add(makeColumn("Msg", AnnPredictionParser.AnnPrediction::getMessages));
}
annTable.setPlaceholder(new Label("No ANN prediction available"));
return annTable;
}
Usage example of com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser in the project jvarkit by lindenb:
class VcfBurdenGoEnrichment, method doWork.
// Gene-Ontology burden enrichment: builds a Node per GO term, maps genes to
// nodes from a 2-column (gene TAB go-term) file, then for each VCF variant
// counts REF/ALT genotypes of affected vs unaffected samples and visits the
// nodes of the variant's genes. Finally reports a Fisher statistic per term.
@Override
public int doWork(final List<String> args) {
if (StringUtil.isBlank(this.readingGo.goUri)) {
LOG.error("Undefined GOs uri.");
return -1;
}
if (this.geneFile == null || !this.geneFile.exists()) {
LOG.error("Undefined gene file option.");
return -1;
}
try {
final GoTree gotree = this.readingGo.createParser().setIgnoreDbXRef(true).parse(this.readingGo.goUri);
List<GoTree.Term> terms = new ArrayList<>(gotree.getTerms());
final Map<GoTree.Term, Node> term2node = new HashMap<>();
// build the node TREE
// Repeated passes over 'terms': a term becomes a Node only once every
// term its relations point to already has a Node (topological construction).
while (!terms.isEmpty()) {
int i = 0;
while (i < terms.size()) {
final GoTree.Term t = terms.get(i);
if (!t.hasRelations()) {
// No outgoing relations: node with no parents.
term2node.put(t, new Node(t));
terms.remove(i);
} else if (t.getRelations().stream().allMatch(L -> term2node.containsKey(L.getTo()))) {
// All relation targets built: create the node and link its parents.
final Node n = new Node(t);
n.parents.addAll(t.getRelations().stream().map(L -> term2node.get(L.getTo())).collect(Collectors.toSet()));
term2node.put(t, n);
terms.remove(i);
} else {
// Some target not built yet; retry on a later pass.
i++;
}
}
}
terms = null;
// Map gene-name -> GO nodes from the user's gene file.
final Set<String> unknownAcn = new HashSet<>();
final Map<String, Set<Node>> gene2node = new HashMap<>();
// NOTE(review): 'r' is closed on every explicit error path but leaks if
// readLine() or parsing throws — consider try-with-resources.
final BufferedReader r = IOUtils.openFileForBufferedReading(this.geneFile);
String line;
while ((line = r.readLine()) != null) {
if (line.isEmpty() || line.startsWith("#"))
continue;
final int t = line.indexOf('\t');
if (t == -1) {
r.close();
LOG.error("tab missing in " + line + " of " + this.geneFile);
return -1;
}
final String gene = line.substring(0, t).trim();
if (StringUtil.isBlank(gene)) {
r.close();
LOG.error("Emtpy gene in " + line);
return -1;
}
// using getTermByName because synonyms were found in GOA
final String termAcn = line.substring(t + 1).trim();
if (unknownAcn.contains(termAcn))
continue;
final GoTree.Term term = gotree.getTermByName(termAcn);
// The second contains() test is redundant: unknownAcn was checked just above.
if (term == null && !unknownAcn.contains(termAcn)) {
unknownAcn.add(termAcn);
LOG.warning("Don't know this GO term in " + line + " of " + this.geneFile + ". Could be obsolete, synonym, go specific division. Skipping.");
continue;
}
final Node node = term2node.get(term);
if (node == null) {
r.close();
LOG.error("Don't know this node in " + line + " of " + this.geneFile);
return -1;
}
Set<Node> nodes = gene2node.get(gene);
if (nodes == null) {
nodes = new HashSet<>();
gene2node.put(gene, nodes);
}
// NOTE(review): incremented once per input line, so duplicate (gene,term)
// lines inflate numGenes even though 'nodes' is a Set — confirm intended.
node.numGenes++;
nodes.add(node);
}
;
// clean up
unknownAcn.clear();
r.close();
final VcfIterator iter = openVcfIterator(oneFileOrNull(args));
final VCFHeader header = iter.getHeader();
final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
// Case/control samples come from an explicit pedigree file or the VCF header.
final Set<Pedigree.Person> persons;
if (this.pedFile != null) {
final Pedigree pedigree = Pedigree.newParser().parse(this.pedFile);
persons = new Pedigree.CaseControlExtractor().extract(header, pedigree);
} else {
persons = new Pedigree.CaseControlExtractor().extract(header);
}
final Set<Pedigree.Person> affected = persons.stream().filter(P -> P.isAffected()).collect(Collectors.toSet());
final Set<Pedigree.Person> unaffected = persons.stream().filter(P -> P.isUnaffected()).collect(Collectors.toSet());
if (affected.isEmpty()) {
LOG.error("No Affected individual");
return -1;
}
if (unaffected.isEmpty()) {
LOG.error("No unaffected individual");
return -1;
}
// VEP columns that may carry a gene identifier.
final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
// REF = hom-ref genotype; a null genotype counts as neither.
final Predicate<Genotype> isWildGenotype = G -> {
if (G == null)
return false;
return G.isHomRef();
};
// ALT = any called genotype that is not hom-ref.
final Predicate<Genotype> isAltGenotype = G -> {
if (G == null)
return false;
return G.isCalled() && !G.isHomRef();
};
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
while (iter.hasNext()) {
final VariantContext ctx = progress.watch(iter.next());
if (!this.variantFilter.test(ctx))
continue;
// Gene names named by this variant's VEP and ANN predictions.
final Set<String> genes = new HashSet<>();
for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
for (final String col : lookColumns) {
final String token = pred.getByCol(col);
if (!StringUtil.isBlank(token)) {
genes.add(token);
}
}
}
for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
final String token = pred.getGeneName();
if (!StringUtil.isBlank(token)) {
genes.add(token);
}
}
if (genes.isEmpty())
continue;
// All GO nodes attached to any of this variant's genes.
final Set<Node> nodes = genes.stream().filter(G -> gene2node.containsKey(G)).flatMap(G -> gene2node.get(G).stream()).collect(Collectors.toSet());
if (nodes.isEmpty())
continue;
// Genotype counts restricted to genotypes passing the user's genotype filter.
final long unaffected_alt = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
final long affected_alt = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
/* no informative */
if (unaffected_alt + affected_alt == 0L) {
continue;
}
final long affected_ref = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
final long unaffected_ref = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
// visit() presumably accumulates the counts up through ancestor nodes;
// the visited flag is reset first so shared ancestors are counted once
// per variant — NOTE(review): confirm against Node's implementation.
nodes.stream().forEach(N -> N.resetVisitedFlag());
nodes.stream().forEach(N -> N.visit(unaffected_ref, unaffected_alt, affected_ref, affected_alt));
}
iter.close();
progress.finish();
LOG.info("Calculating Fisher and dumping.. please wait");
final PrintWriter pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
pw.println("#go_term\tfisher\tname\tgo_term_depth\tcount_genes_in_this_node" + "\tunaffected_ref_gt" + "\tunaffected_alt_gt" + "\taffected_ref_gt" + "\taffected_alt_gt");
// Sort by increasing Fisher statistic; optionally include never-seen terms.
term2node.values().stream().filter(N -> this.show_never_seeen_term || N.sum() > 0L).sorted((n1, n2) -> Double.compare(n1.fisher(), n2.fisher())).forEach(N -> {
pw.print(N.goTerm.getAcn());
pw.print('\t');
pw.print(N.fisher());
pw.print("\t");
pw.print(N.goTerm.getName().replaceAll("[ \',\\-]+", "_"));
pw.print("\t");
pw.print(N.goTerm.getMinDepth());
pw.print('\t');
pw.print(N.numGenes);
pw.print('\t');
pw.print(N.unaffected_ref);
pw.print('\t');
pw.print(N.unaffected_alt);
pw.print('\t');
pw.print(N.affected_ref);
pw.print('\t');
pw.print(N.affected_alt);
pw.println();
});
pw.flush();
pw.close();
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
}
}
Aggregations