use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
the class VcfGeneSplitter method doVcfToVcf.
/**
 * Splits the input VCF into one VCF per key and stores the result in a zip archive.
 * <p>
 * Each variant line is paired with every key returned by
 * {@code getVariantKeys(vepPredictionParser, ctx)} and pushed into a
 * {@code SortingCollection}. The collection is then walked in runs of equal key:
 * each run is written as the zip entry {@code <baseZipDir>/<key>.vcf} and adds one
 * line to a tab-delimited report that is finally stored as
 * {@code <baseZipDir>/manifest.bed}.
 *
 * @param inputName path or URI of the VCF to read, or {@code null} to read stdin
 * @param outputFile zip file to create
 * @return {@code RETURN_OK} on success, {@code -1} when any exception was raised
 */
@Override
protected int doVcfToVcf(String inputName, File outputFile) {
    SortingCollection<KeyAndLine> sortingcollection = null;
    BufferedReader in = null;
    FileOutputStream fos = null;
    ZipOutputStream zout = null;
    CloseableIterator<KeyAndLine> iter = null;
    EqualRangeIterator<KeyAndLine> eqiter = null;
    PrintWriter pw = null;
    File tmpReportFile = null;
    try {
        in = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
        // VEP prediction parser configured from the input header; used to compute the keys
        final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory().header(cah.header).get();
        sortingcollection = SortingCollection.newInstance(KeyAndLine.class, new KeyAndLineCodec(), new KeyAndLineComparator(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingcollection.setDestructiveIteration(true);
        // read variants
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(cah.header);
        String line;
        while ((line = in.readLine()) != null) {
            final VariantContext ctx = progess.watch(cah.codec.decode(line));
            // no check for ctx.isFiltered here, we do this later.
            for (final String key : this.getVariantKeys(vepPredictionParser, ctx)) {
                sortingcollection.add(new KeyAndLine(key, line));
            }
        }
        progess.finish();
        sortingcollection.doneAdding();
        LOG.info("creating zip " + outputFile);
        fos = new FileOutputStream(outputFile);
        zout = new ZipOutputStream(fos);
        tmpReportFile = File.createTempFile("_tmp.", ".txt", writingSortingCollection.getTmpDirectories().get(0));
        // fallback only: the file is also deleted explicitly in the finally block
        tmpReportFile.deleteOnExit();
        pw = IOUtils.openFileForPrintWriter(tmpReportFile);
        pw.println("#chrom\tstart\tend\tkey\tCount_Variants");
        iter = sortingcollection.iterator();
        // group consecutive records sharing the same key
        eqiter = new EqualRangeIterator<>(iter, new Comparator<KeyAndLine>() {
            @Override
            public int compare(final KeyAndLine o1, final KeyAndLine o2) {
                return o1.key.compareTo(o2.key);
            }
        });
        while (eqiter.hasNext()) {
            final List<KeyAndLine> buffer = eqiter.next();
            final KeyAndLine first = buffer.get(0);
            LOG.info(first.key);
            final List<VariantContext> variants = new ArrayList<>(buffer.size());
            // the reported contig is the one of the last variant of the group
            String contig = null;
            int chromStart = Integer.MAX_VALUE;
            int chromEnd = 0;
            for (final KeyAndLine kal : buffer) {
                final VariantContext ctx = cah.codec.decode(kal.ctx);
                variants.add(ctx);
                contig = ctx.getContig();
                chromStart = Math.min(chromStart, ctx.getStart());
                chromEnd = Math.max(chromEnd, ctx.getEnd());
            }
            pw.println(contig + "\t" + (chromStart - 1) + // -1 for bed compatibility
                    "\t" + chromEnd + "\t" + first.key + "\t" + variants.size());
            // save vcf file
            final ZipEntry ze = new ZipEntry(this.baseZipDir + "/" + first.key + ".vcf");
            zout.putNextEntry(ze);
            final VariantContextWriter out = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(zout));
            final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
            header2.addMetaDataLine(new VCFHeaderLine("VcfGeneSplitter.Name", String.valueOf(first.key)));
            out.writeHeader(header2);
            for (final VariantContext ctx : variants) {
                out.add(ctx);
            }
            // the writer is closed here; zout itself was passed through IOUtils.uncloseableOutputStream
            out.close();
            zout.closeEntry();
        }
        eqiter.close();
        eqiter = null;
        iter.close();
        iter = null;
        LOG.info("saving report");
        pw.flush();
        pw.close();
        pw = null;
        final ZipEntry entry = new ZipEntry(this.baseZipDir + "/manifest.bed");
        zout.putNextEntry(entry);
        IOUtils.copyTo(tmpReportFile, zout);
        zout.closeEntry();
        zout.finish();
        zout.close();
        zout = null;
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(eqiter);
        CloserUtil.close(iter);
        if (sortingcollection != null) {
            sortingcollection.cleanup();
        }
        CloserUtil.close(in);
        CloserUtil.close(pw);
        // close the zip stream as well so its resources are released on the error path
        CloserUtil.close(zout);
        CloserUtil.close(fos);
        if (tmpReportFile != null) {
            // best effort: deleteOnExit() remains as a fallback if this fails
            tmpReportFile.delete();
        }
    }
}
use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
the class VCFCompare method doWork.
/**
 * Compares two VCF files and writes an HTML report.
 * <p>
 * The lines of both files are loaded into one {@code SortingCollection}, each tagged with
 * the index (0 or 1) of the file it came from. Consecutive records that compare equal
 * under {@code LineAndFileComparator} are grouped into a pair of contexts, one slot per
 * file, and every registered {@code Venn0} counter visits that pair. The sample lists and
 * each counter are then written as XML/HTML to {@code outputFile} or stdout.
 *
 * @param args exactly two VCF paths or URIs
 * @return {@code 0} on success, {@code -1} on bad arguments or on any exception
 */
@Override
public int doWork(final List<String> args) {
    if (args.isEmpty()) {
        LOG.error("VCFs missing.");
        return -1;
    }
    if (args.size() != 2) {
        LOG.error("Illegal number of arguments. Expected two VCFs");
        return -1;
    }
    PrintWriter pw = null;
    XMLStreamWriter w = null;
    InputStream in = null;
    SortingCollection<LineAndFile> variants = null;
    CloseableIterator<LineAndFile> iter = null;
    try {
        LineAndFileComparator varcmp = new LineAndFileComparator();
        variants = SortingCollection.newInstance(LineAndFile.class, new LineAndFileCodec(), varcmp, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        variants.setDestructiveIteration(true);
        for (int i = 0; i < 2; ++i) {
            this.inputs[i] = new Input();
            this.inputs[i].codec = VCFUtils.createDefaultVCFCodec();
            this.inputs[i].filename = args.get(i);
            LOG.info("Opening " + this.inputs[i].filename);
            in = IOUtils.openURIForReading(this.inputs[i].filename);
            final LineReader lr = new SynchronousLineReader(in);
            final LineIterator li = new LineIteratorImpl(lr);
            this.inputs[i].header = (VCFHeader) this.inputs[i].codec.readActualHeader(li);
            this.inputs[i].vepPredictionParser = new VepPredictionParserFactory(this.inputs[i].header).get();
            this.inputs[i].snpEffPredictionParser = new SnpEffPredictionParserFactory(this.inputs[i].header).get();
            this.inputs[i].annPredictionParser = new AnnPredictionParserFactory(this.inputs[i].header).get();
            // the remaining lines are the variant records: store them with their file index
            while (li.hasNext()) {
                LineAndFile laf = new LineAndFile();
                laf.fileIdx = i;
                laf.line = li.next();
                variants.add(laf);
            }
            LOG.info("Done Reading " + this.inputs[i].filename);
            CloserUtil.close(li);
            CloserUtil.close(lr);
            CloserUtil.close(in);
            in = null;
        }
        variants.doneAdding();
        LOG.info("Done Adding");
        Set<String> commonSamples = new TreeSet<String>(this.inputs[0].header.getSampleNamesInOrder());
        commonSamples.retainAll(this.inputs[1].header.getSampleNamesInOrder());
        List<Venn0> venn1List = new ArrayList<VCFCompare.Venn0>();
        venn1List.add(new Venn1("ALL"));
        venn1List.add(new Venn1("having ID") {
            @Override
            public VariantContext filter(VariantContext ctx, int fileIndex) {
                return ctx == null || !ctx.hasID() ? null : ctx;
            }
        });
        venn1List.add(new Venn1("QUAL greater 30") {
            @Override
            public VariantContext filter(VariantContext ctx, int fileIndex) {
                return ctx == null || !ctx.hasLog10PError() || ctx.getPhredScaledQual() < 30.0 ? null : ctx;
            }
        });
        for (VariantContext.Type t : VariantContext.Type.values()) {
            venn1List.add(new VennType(t));
        }
        // one counter per sequence-ontology term and per annotation source (VEP, SnpEff, ANN)
        for (SequenceOntologyTree.Term term : SequenceOntologyTree.getInstance().getTerms()) {
            venn1List.add(new VennPred("vep", term) {
                @Override
                Set<Term> terms(VariantContext ctx, int file_id) {
                    Set<Term> tt = new HashSet<SequenceOntologyTree.Term>();
                    for (VepPredictionParser.VepPrediction pred : VCFCompare.this.inputs[file_id].vepPredictionParser.getPredictions(ctx)) {
                        tt.addAll(pred.getSOTerms());
                    }
                    return tt;
                }
            });
            venn1List.add(new VennPred("SnpEff", term) {
                @Override
                Set<Term> terms(VariantContext ctx, int file_id) {
                    Set<Term> tt = new HashSet<SequenceOntologyTree.Term>();
                    for (SnpEffPredictionParser.SnpEffPrediction pred : VCFCompare.this.inputs[file_id].snpEffPredictionParser.getPredictions(ctx)) {
                        tt.addAll(pred.getSOTerms());
                    }
                    return tt;
                }
            });
            venn1List.add(new VennPred("ANN", term) {
                @Override
                Set<Term> terms(VariantContext ctx, int file_id) {
                    Set<Term> tt = new HashSet<SequenceOntologyTree.Term>();
                    for (AnnPredictionParser.AnnPrediction pred : VCFCompare.this.inputs[file_id].annPredictionParser.getPredictions(ctx)) {
                        tt.addAll(pred.getSOTerms());
                    }
                    return tt;
                }
            });
        }
        for (String s : commonSamples) {
            venn1List.add(new VennGType(s));
        }
        /* START : digest results ====================== */
        Counter<String> diff = new Counter<String>();
        // records that compare equal to each other under varcmp
        List<LineAndFile> row = new ArrayList<LineAndFile>();
        iter = variants.iterator();
        for (; ; ) {
            LineAndFile rec = null;
            if (iter.hasNext()) {
                rec = iter.next();
            }
            // flush the current row at end of input or when the next record differs
            if (rec == null || (!row.isEmpty() && varcmp.compare(row.get(0), rec) != 0)) {
                if (!row.isEmpty()) {
                    diff.incr("count.variations");
                    VariantContext[] contexes_init = new VariantContext[] { null, null };
                    for (LineAndFile var : row) {
                        // only the first record of each file is kept for a given row
                        if (contexes_init[var.fileIdx] != null) {
                            LOG.error("Duplicate context in " + inputs[var.fileIdx].filename + " : " + var.line);
                            continue;
                        }
                        contexes_init[var.fileIdx] = var.getContext();
                    }
                    for (Venn0 venn : venn1List) {
                        venn.visit(contexes_init);
                    }
                    row.clear();
                }
                if (rec == null)
                    break;
            }
            row.add(rec);
        }
        iter.close();
        iter = null;
        /* END : digest results ====================== */
        pw = super.openFileOrStdoutAsPrintWriter(outputFile);
        XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
        w = xmlfactory.createXMLStreamWriter(pw);
        w.writeStartElement("html");
        w.writeStartElement("body");
        /* specific samples */
        w.writeStartElement("div");
        w.writeStartElement("dl");
        // i=0 and i=1: samples found only in that file; i=2: samples shared by both files
        for (int i = 0; i < 3; ++i) {
            String title;
            Set<String> samples;
            switch(i) {
                case 0:
                case 1:
                    title = "Sample(s) for " + this.inputs[i].filename + ".";
                    samples = new TreeSet<String>(this.inputs[i].header.getSampleNamesInOrder());
                    samples.removeAll(commonSamples);
                    break;
                default:
                    title = "Common Sample(s).";
                    samples = new TreeSet<String>(commonSamples);
                    break;
            }
            w.writeStartElement("dt");
            w.writeCharacters(title);
            w.writeEndElement();
            w.writeStartElement("dd");
            w.writeStartElement("ol");
            for (String s : samples) {
                w.writeStartElement("li");
                w.writeCharacters(s);
                w.writeEndElement();
            }
            w.writeEndElement();
            w.writeEndElement();
        }
        // dl
        w.writeEndElement();
        // div
        w.writeEndElement();
        for (Venn0 v : venn1List) {
            v.write(w);
        }
        // body
        w.writeEndElement();
        // html
        w.writeEndElement();
        w.writeEndDocument();
        w.close();
        w = null;
        pw.flush();
        pw.close();
        pw = null;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(w);
        CloserUtil.close(pw);
        // still open only when an exception interrupted the reading or the digest phase
        CloserUtil.close(iter);
        CloserUtil.close(in);
        if (variants != null)
            variants.cleanup();
    }
    return 0;
}
use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
the class VCFComposite method doWork.
/**
 * Scans a VCF contig by contig and hands the variants of each gene to a disease model.
 * <p>
 * With {@code listModels} set, only the names and descriptions of the available
 * {@code Type} values are printed. Otherwise the pedigree is loaded (it must contain at
 * least one affected and one unaffected sample), and each accepted variant is stored once
 * per gene key found in its ANN and VEP predictions. Whenever the contig changes, or the
 * input ends, the stored records are grouped by gene key and each group is passed to
 * {@code model.scan(...)} together with the output writer.
 *
 * @param args zero or one input VCF (stdin is used when none is given)
 * @return {@code 0} on success, {@code -1} on invalid pedigree or on any exception
 */
@Override
public int doWork(final List<String> args) {
    PrintWriter out = null;
    LineIterator r = null;
    SortingCollection<GeneAndVariant> sorting = null;
    try {
        out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        if (listModels) {
            for (final Type t : Type.values()) {
                out.println(t.name());
                out.println("\t" + t.getDescription());
            }
            out.flush();
            return 0;
        }
        this.pedigree = Pedigree.newParser().parse(pedigreeFile);
        if (this.pedigree.getAffected().isEmpty()) {
            LOG.error("No Affected sample in " + this.pedigreeFile);
            return -1;
        }
        if (this.pedigree.getUnaffected().isEmpty()) {
            LOG.error("No Unaffected sample in " + this.pedigreeFile);
            return -1;
        }
        final DiseaseModel model = this.createModel();
        final String inputName = super.oneFileOrNull(args);
        r = (inputName == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(inputName));
        final VCFCodec codec = new VCFCodec();
        final VCFHeader header = (VCFHeader) codec.readActualHeader(r);
        final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
        final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
        String prevContig = null;
        for (; ; ) {
            String line;
            final VariantContext ctx;
            if (r.hasNext()) {
                line = r.next();
                ctx = codec.decode(line);
            } else {
                // end of input: a null context triggers the final dump below
                line = null;
                ctx = null;
            }
            if (ctx == null || !ctx.getContig().equals(prevContig)) {
                if (sorting != null) {
                    LOG.debug("Dump contig " + prevContig);
                    sorting.doneAdding();
                    CloseableIterator<GeneAndVariant> iter2 = sorting.iterator();
                    // group consecutive records sharing the same gene key
                    EqualRangeIterator<GeneAndVariant> eqiter = new EqualRangeIterator<>(iter2, (A, B) -> A.gene.compareTo(B.gene));
                    while (eqiter.hasNext()) {
                        final List<GeneAndVariant> variants = eqiter.next();
                        model.scan(variants.get(0).gene, variants.stream().map(L -> codec.decode(L.ctxLine)).collect(Collectors.toList()), out);
                    }
                    eqiter.close();
                    iter2.close();
                    sorting.cleanup();
                }
                sorting = null;
                if (ctx == null)
                    break;
                prevContig = ctx.getContig();
            }
            if (!ctx.isVariant())
                continue;
            if (!acceptFiltered && ctx.isFiltered())
                continue;
            if (!acceptID && ctx.hasID())
                continue;
            if (!model.accept(ctx))
                continue;
            // gene keys are prefixed with the contig name
            final Set<String> geneKeys = new HashSet<>();
            for (final AnnPredictionParser.AnnPrediction pred : annParser.getPredictions(ctx)) {
                geneKeys.addAll(pred.getGeneKeys().stream().map(S -> ctx.getContig() + "_" + S).collect(Collectors.toSet()));
            }
            for (final VepPredictionParser.VepPrediction pred : vepParser.getPredictions(ctx)) {
                geneKeys.addAll(pred.getGeneKeys().stream().map(S -> ctx.getContig() + "_" + S).collect(Collectors.toSet()));
            }
            // a new collection is created for the first accepted variant seen after each dump
            if (sorting == null) {
                sorting = SortingCollection.newInstance(GeneAndVariant.class, new GeneAndVariantCodec(), (A, B) -> {
                    int i = A.gene.compareTo(B.gene);
                    if (i != 0)
                        return i;
                    return A.ctxLine.compareTo(B.ctxLine);
                }, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
                sorting.setDestructiveIteration(true);
            }
            for (final String gk : geneKeys) {
                final GeneAndVariant gav = new GeneAndVariant();
                gav.gene = gk;
                gav.ctxLine = line;
                sorting.add(gav);
            }
        }
        out.flush();
        out.close();
        out = null;
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        // non-null only when an exception interrupted the scan before the regular cleanup
        if (sorting != null) {
            sorting.cleanup();
        }
        CloserUtil.close(r);
        CloserUtil.close(out);
    }
}
use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
the class VcfToRdf method scanVCF.
/**
 * Reads one VCF and emits statements describing each of its variants.
 * <p>
 * For every variant a {@code vcf:Variant} resource is emitted with its chromosome,
 * position, reference bases, optional ID and optional quality. Alternate alleles,
 * filters and per-sample genotypes are emitted only when {@code printAlleles},
 * {@code printFilters} and {@code printGenotypes} are set. Reading stops early as soon
 * as {@code this.w.checkError()} reports a problem.
 *
 * @param filein the VCF file to read, or {@code null} to read the VCF from stdin
 * @throws IOException wrapping any exception raised while reading or emitting
 */
private void scanVCF(final File filein) throws IOException {
    VcfIterator reader = null;
    try {
        final URI source = (filein == null ? null : filein.toURI());
        reader = (filein == null ? VCFUtils.createVcfIteratorStdin() : VCFUtils.createVcfIteratorFromFile(filein));
        final VCFHeader header = reader.getHeader();
        final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
        writeHeader(header, source);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        while (reader.hasNext()) {
            if (this.w.checkError()) {
                LOG.warn("I/O interruption");
                break;
            }
            final VariantContext ctx = progress.watch(reader.next());
            // contig:start:ref identifies the variant in every URI built below
            final String contig = ctx.getContig();
            final int position = ctx.getStart();
            final String refBases = ctx.getReference().getBaseString();
            final String locus = contig + ":" + position + ":" + refBases;

            /* Variant */
            final URI variant = URI.create("urn:variant/" + locus);
            emit(variant,
                    "rdf:type", "vcf:Variant",
                    "vcf:chrom", URI.create("urn:chrom/" + contig),
                    "vcf:position", position,
                    "vcf:ref", refBases,
                    "vcf:id", (ctx.hasID() ? ctx.getID() : null),
                    "vcf:qual", (ctx.hasLog10PError() ? ctx.getPhredScaledQual() : null));

            if (this.printAlleles) {
                for (final Allele alt : ctx.getAlternateAlleles()) {
                    emit(variant, "vcf:alt", alt.getBaseString());
                }
            }

            if (this.printFilters) {
                for (final String filterName : ctx.getFilters()) {
                    emit(variant, "vcf:filter", URI.create("urn:filter/" + filterName));
                }
            }

            if (this.printVep) {
                for (final VepPrediction prediction : vepPredictionParser.getPredictions(ctx)) {
                    // Intentionally empty: the emission of VEP predictions is disabled.
                    // The disabled code emitted a "vep:Prediction" resource per prediction
                    // (variant, allele, SO terms, Ensembl transcript) and linked the
                    // transcript to its Ensembl gene and protein.
                }
            }

            if (this.printGenotypes) {
                for (final String sample : ctx.getSampleNames()) {
                    final Genotype g = ctx.getGenotype(sample);
                    final List<Object> statements = new ArrayList<>();
                    statements.add("vcf:sample");
                    statements.add(URI.create("urn:sample/" + sample));
                    statements.add("vcf:variant");
                    statements.add(variant);
                    statements.add("rdf:type");
                    statements.add("vcf:Genotype");
                    if (g.hasDP()) {
                        statements.add("vcf:dp");
                        statements.add(g.getDP());
                    }
                    if (g.hasGQ()) {
                        statements.add("vcf:gq");
                        statements.add(g.getGQ());
                    }
                    if (g.isCalled()) {
                        // a second, more specific rdf:type is added for het and hom calls
                        String zygosityType = null;
                        if (g.isHet()) {
                            zygosityType = g.isHetNonRef() ? "vcf:HetNonRefGenotype" : "vcf:HetGenotype";
                        } else if (g.isHom()) {
                            zygosityType = g.isHomRef() ? "vcf:HomRefGenotype" : "vcf:HomVarGenotype";
                        }
                        if (zygosityType != null) {
                            statements.add("rdf:type");
                            statements.add(zygosityType);
                        }
                        for (final Allele a : g.getAlleles()) {
                            statements.add("vcf:allele");
                            statements.add(a.getBaseString());
                        }
                    }
                    emit(URI.create("urn:gt/" + locus + ":" + sample), statements.toArray());
                }
            }
        }
        reader.close();
        reader = null;
        progress.finish();
    } catch (final Exception e) {
        throw new IOException(e);
    } finally {
        CloserUtil.close(reader);
    }
}
use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
the class VcfGeneOntology method filterVcfIterator.
/**
 * Annotates the variants of a VCF stream with GO terms and writes them to {@code outputFile}.
 * <p>
 * Gene symbols are collected from the SnpEff and VEP predictions of each variant and
 * restricted to the keys of {@code name2go}. Each remaining symbol yields one value of
 * the {@code TAG} INFO attribute, formatted as {@code symbol|acn&acn...}.
 * <ul>
 * <li>A variant without any value is written unchanged, or dropped when
 * {@code removeIfNoGo} is set.</li>
 * <li>When {@code goTermToFilter} is set, a variant is rejected if "one of its terms is a
 * descendant of a user term" equals {@code inverse_filter}. A rejected variant gets the
 * FILTER {@code filterName}, or is dropped when no filter name is defined.</li>
 * </ul>
 *
 * @param in the VCF source; it is not closed by this method
 * @throws IOException declared by the method signature
 */
private void filterVcfIterator(final VcfIterator in) throws IOException {
    VariantContextWriter writer = null;
    try {
        final VCFHeader srcHeader = in.getHeader();
        final VCFHeader outHeader = new VCFHeader(srcHeader);
        final String prefix = getClass().getSimpleName();
        outHeader.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "GO terms from GO " + GO + " and GOA=" + GOA));
        outHeader.addMetaDataLine(new VCFHeaderLine(prefix + "CmdLine", String.valueOf(getProgramCommandLine())));
        outHeader.addMetaDataLine(new VCFHeaderLine(prefix + "Version", String.valueOf(getVersion())));
        outHeader.addMetaDataLine(new VCFHeaderLine(prefix + "HtsJdkVersion", HtsjdkVersion.getVersion()));
        outHeader.addMetaDataLine(new VCFHeaderLine(prefix + "HtsJdkHome", HtsjdkVersion.getHome()));
        if (filterName != null) {
            outHeader.addMetaDataLine(new VCFFilterHeaderLine(filterName, "Flag GO terms " + (inverse_filter ? " not descendant of " : "") + " the provided GO terms"));
        }
        writer = super.openVariantContextWriter(outputFile);
        writer.writeHeader(outHeader);

        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(srcHeader.getSequenceDictionary());
        final SnpEffPredictionParser snpEffParser = new SnpEffPredictionParserFactory().header(srcHeader).get();
        final VepPredictionParser vepParser = new VepPredictionParserFactory().header(srcHeader).get();

        while (in.hasNext()) {
            if (System.out.checkError()) {
                break;
            }
            final VariantContext ctx = progress.watch(in.next());

            /* gene symbols attached to this variant */
            final Set<String> symbols = new HashSet<String>();
            /* SnpEff contributes the gene name */
            for (final SnpEffPrediction pred : snpEffParser.getPredictions(ctx)) {
                final String name = pred.getGeneName();
                if (name != null && !name.isEmpty()) {
                    symbols.add(name);
                }
            }
            /* VEP contributes the gene name, the gene and the HGNC symbol */
            for (final VepPrediction pred : vepParser.getPredictions(ctx)) {
                String name = pred.getGeneName();
                if (name != null && !name.isEmpty()) {
                    symbols.add(name);
                }
                name = pred.getGene();
                if (name != null && !name.isEmpty()) {
                    symbols.add(name);
                }
                name = pred.getHGNC();
                if (name != null && !name.isEmpty()) {
                    symbols.add(name);
                }
            }
            /* keep only the symbols known in name2go */
            symbols.retainAll(this.name2go.keySet());

            boolean descendantFound = false;
            /* one "symbol|acn&acn..." value per symbol */
            final List<String> atts = new ArrayList<String>();
            for (final String symbol : symbols) {
                final Set<GoTree.Term> terms = this.name2go.get(symbol);
                if (terms == null || terms.isEmpty()) {
                    continue;
                }
                final StringBuilder sb = new StringBuilder(symbol);
                sb.append("|");
                boolean needSeparator = false;
                for (final GoTree.Term term : terms) {
                    /* look for a user term only until a first match has been found */
                    if (!descendantFound && this.goTermToFilter != null) {
                        for (final GoTree.Term userTerm : this.goTermToFilter) {
                            if (userTerm.hasDescendant(term.getAcn())) {
                                descendantFound = true;
                                break;
                            }
                        }
                    }
                    if (needSeparator) {
                        sb.append("&");
                    }
                    sb.append(term.getAcn());
                    needSeparator = true;
                }
                atts.add(sb.toString());
            }

            /* no GO term for this variant */
            if (atts.isEmpty()) {
                if (!removeIfNoGo) {
                    writer.add(ctx);
                }
                continue;
            }

            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            /* rejected when the "descendant found" state equals the inverse flag */
            final boolean rejected = this.goTermToFilter != null && (this.inverse_filter == descendantFound);
            if (rejected) {
                if (this.filterName == null) {
                    /* no FILTER name available: the variant is dropped */
                    continue;
                }
                final Set<String> filters = new HashSet<String>(ctx.getFilters());
                filters.add(this.filterName);
                vcb.filters(filters);
            }
            /* add go terms */
            vcb.attribute(TAG, atts);
            writer.add(vcb.make());
        }
        progress.finish();
        writer.close();
        writer = null;
    } finally {
        CloserUtil.close(writer);
    }
}
Aggregations