Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction in project jvarkit by lindenb.
From class VcfToSql, method read:
private void read(File filename) throws IOException {
    /* insert the four bases A,C,G,T */
    this.alleleTable.insert(outputWriter, null, "A");
    this.alleleTable.insert(outputWriter, null, "C");
    this.alleleTable.insert(outputWriter, null, "G");
    this.alleleTable.insert(outputWriter, null, "T");
    /* insert this VCF file */
    this.vcfFileTable.insert(outputWriter, null, filename);
    final SelectStmt vcffile_id = new SelectStmt(this.vcfFileTable);
    final Map<String, SelectStmt> sample2sampleid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> filter2filterid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> chrom2chromId = new HashMap<String, SelectStmt>();
    final VcfIterator r = VCFUtils.createVcfIteratorFromFile(filename);
    final VCFHeader header = r.getHeader();
    /* parse the samples */
    for (final String sampleName : header.getSampleNamesInOrder()) {
        this.sampleTable.insert(outputWriter, null, sampleName);
        final SelectStmt sample_id = new SelectStmt(this.sampleTable, "name", sampleName);
        sample2sampleid.put(sampleName, sample_id);
        this.sample2fileTable.insert(outputWriter, null, vcffile_id, sample_id);
    }
    /* parse the FILTERs declared in the header */
    for (final VCFFilterHeaderLine filter : header.getFilterLines()) {
        this.filterTable.insert(outputWriter, null, vcffile_id, filter.getID(), filter.getValue());
        filter2filterid.put(filter.getID(), new SelectStmt(this.filterTable, "name", filter.getID()));
    }
    filter2filterid.put(VCFConstants.PASSES_FILTERS_v4, new SelectStmt(this.filterTable, "name", VCFConstants.PASSES_FILTERS_v4));
    final SAMSequenceDictionary dict = header.getSequenceDictionary();
    if (dict == null) {
        throw new RuntimeException("dictionary missing in VCF");
    }
    /* parse the sequence dictionary */
    for (final SAMSequenceRecord ssr : dict.getSequences()) {
        this.chromosomeTable.insert(outputWriter, null, vcffile_id, ssr.getSequenceName(), ssr.getSequenceLength());
        chrom2chromId.put(ssr.getSequenceName(), new SelectStmt(this.chromosomeTable, "name", ssr.getSequenceName()));
    }
    final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
    final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
    int nVariants = 0;
    while (r.hasNext()) {
        if (this.outputWriter.checkError())
            break;
        final VariantContext var = progress.watch(r.next());
        ++nVariants;
        /* insert the REF allele */
        this.alleleTable.insert(outputWriter, null, var.getReference().getBaseString());
        /* insert the variant */
        this.variantTable.insert(outputWriter, null,
            vcffile_id,
            nVariants,
            chrom2chromId.get(var.getContig()),
            var.getStart(),
            (var.hasID() ? var.getID() : null),
            new SelectStmt(this.alleleTable, "bases", var.getReference().getBaseString()),
            (var.hasLog10PError() ? var.getPhredScaledQual() : null));
        final SelectStmt variant_id = new SelectStmt(variantTable);
        /* insert the ALT alleles */
        for (final Allele alt : var.getAlternateAlleles()) {
            this.alleleTable.insert(outputWriter, null, alt.getBaseString());
            this.variant2altTable.insert(outputWriter, null, variant_id, new SelectStmt(this.alleleTable, "bases", alt.getBaseString()));
        }
        /* insert the FILTERs; each must have been declared in the header */
        for (final String filter : var.getFilters()) {
            if (filter2filterid.get(filter) == null) {
                throw new IOException("VCF Error: filter " + filter + " is not defined in the VCF header.");
            }
            this.variant2filters.insert(outputWriter, null, variant_id, filter2filterid.get(filter));
        }
        if (!this.ignore_info) {
            for (final VepPrediction pred : vepPredictionParser.getPredictions(var)) {
                /*
                vepPrediction.insert(
                    outputWriter,
                    null,
                    variant_id,
                    pred.getEnsemblGene(),
                    pred.getEnsemblTranscript(),
                    pred.getEnsemblProtein(),
                    pred.getSymbol());
                SelectStmt pred_id = new SelectStmt(vepPrediction);
                for (SequenceOntologyTree.Term t : pred.getSOTerms()) {
                    String term = t.getAcn().replace(':', '_');
                    // underscore form for bioportal compatibility
                    soTermTable.insert(
                        outputWriter,
                        null,
                        term,
                        t.getAcn());
                    SelectStmt term_id = new SelectStmt(soTermTable, "acn", term);
                    vepPrediction2so.insert(
                        outputWriter,
                        null,
                        pred_id,
                        term_id);
                }
                */
            }
        }
        /* insert the genotypes; note: assumes diploid calls (alleles 0 and 1) */
        for (final String sampleName : sample2sampleid.keySet()) {
            final Genotype g = var.getGenotype(sampleName);
            if (!g.isAvailable() || g.isNoCall())
                continue;
            genotypeTable.insert(outputWriter, null,
                variant_id,
                sample2sampleid.get(sampleName),
                g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(0).getBaseString()) : null,
                g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(1).getBaseString()) : null,
                g.hasDP() ? g.getDP() : null,
                g.hasGQ() ? g.getGQ() : null);
        }
    }
    r.close();
}
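
Every snippet on this page follows the same two-step pattern: build a VepPredictionParser once from the VCFHeader (the factory reads the VEP INFO definition, typically the CSQ tag, from the header), then call getPredictions(...) on each VariantContext. A minimal standalone sketch of that pattern; the class name VepScanSketch is ours, and the import paths are inferred from the usages on this page:

import java.io.File;
import com.github.lindenb.jvarkit.util.vcf.VCFUtils;
import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser;
import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction;
import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;

public class VepScanSketch {
    public static void main(final String[] args) throws Exception {
        final VcfIterator in = VCFUtils.createVcfIteratorFromFile(new File(args[0]));
        final VCFHeader header = in.getHeader();
        // build once per file: the factory locates the VEP annotation definition in the header
        final VepPredictionParser parser = new VepPredictionParserFactory(header).get();
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            // one VepPrediction per annotated transcript/feature of this variant
            for (final VepPrediction pred : parser.getPredictions(ctx)) {
                System.out.println(ctx.getContig() + ":" + ctx.getStart() + "\t" + pred.getSymbol());
            }
        }
        in.close();
    }
}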
Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction in project jvarkit by lindenb.
From class VCFComparePredictions, method doWork:
@Override
public int doWork(List<String> args) {
    PrintWriter out = null;
    SortingCollection<LineAndFile> variants = null;
    try {
        if (args.isEmpty()) {
            LOG.error("Illegal number of arguments");
            return -1;
        }
        out = super.openFileOrStdoutAsPrintWriter(super.outputFile);
        variants = SortingCollection.newInstance(
            LineAndFile.class,
            new AbstractVCFCompareBase.LineAndFileCodec(),
            new AbstractVCFCompareBase.LineAndFileComparator(),
            super.sortingCollectionArgs.getMaxRecordsInRam(),
            super.sortingCollectionArgs.getTmpPaths());
        variants.setDestructiveIteration(true);
        for (final String filename : args) {
            LOG.info("Reading from " + filename);
            final Input input = super.put(variants, filename);
            LOG.info("end reading " + input.filename);
        }
        final List<PredictionTuple> predictionTuples = new ArrayList<PredictionTuple>(super.inputs.size());
        for (final AbstractVCFCompareBase.Input input : this.inputs) {
            final PredictionTuple predictionTuple = new PredictionTuple();
            predictionTuple.snpEffPredictionParser = new SnpEffPredictionParserFactory(input.codecAndHeader.header).get();
            predictionTuple.vepPredictionParser = new VepPredictionParserFactory(input.codecAndHeader.header).get();
            predictionTuples.add(predictionTuple);
        }
        final List<AbstractVCFCompareBase.LineAndFile> row = new ArrayList<LineAndFile>(super.inputs.size());
        final CloseableIterator<LineAndFile> iter = variants.iterator();
        final Comparator<LineAndFile> posCompare = (A, B) -> A.getContigPosRef().compareTo(B.getContigPosRef());
        for (;;) {
            LineAndFile rec = null;
            if (iter.hasNext()) {
                rec = iter.next();
            }
            /* end of input, or the record starts a new position: flush the current row */
            if (rec == null || (!row.isEmpty() && posCompare.compare(row.get(0), rec) != 0)) {
                if (!row.isEmpty()) {
                    boolean printed = false;
                    final VariantContext ctx = row.get(0).getContext();
                    if (row.size() != this.inputs.size()) {
                        startLine(out, ctx);
                        out.println("\tDiscordant number of variants");
                        printed = true;
                    }
                    /* compare every pair of inputs at this position */
                    for (int i = 0; i + 1 < row.size(); ++i) {
                        final Input input1 = this.inputs.get(row.get(i).fileIdx);
                        final VariantContext ctx1 = row.get(i).getContext();
                        final PredictionTuple predtuple1 = predictionTuples.get(row.get(i).fileIdx);
                        final List<VepPrediction> vepPredictions1 = predtuple1.vepPredictionParser.getPredictions(ctx1);
                        final List<SnpEffPrediction> snpEffPredictions1 = predtuple1.snpEffPredictionParser.getPredictions(ctx1);
                        final Set<SequenceOntologyTree.Term> so_vep_1 = getVepSoTerms(predtuple1.vepPredictionParser, ctx1);
                        final Set<SequenceOntologyTree.Term> so_snpeff_1 = getSnpEffSoTerms(predtuple1.snpEffPredictionParser, ctx1);
                        for (int j = i + 1; j < row.size(); ++j) {
                            final Input input2 = this.inputs.get(row.get(j).fileIdx);
                            final VariantContext ctx2 = row.get(j).getContext();
                            final PredictionTuple predtuple2 = predictionTuples.get(row.get(j).fileIdx);
                            final List<VepPrediction> vepPredictions2 = predtuple2.vepPredictionParser.getPredictions(ctx2);
                            final List<SnpEffPrediction> snpEffPredictions2 = predtuple2.snpEffPredictionParser.getPredictions(ctx2);
                            final Set<SequenceOntologyTree.Term> so_vep_2 = getVepSoTerms(predtuple2.vepPredictionParser, ctx2);
                            final Set<SequenceOntologyTree.Term> so_snpeff_2 = getSnpEffSoTerms(predtuple2.snpEffPredictionParser, ctx2);
                            if (vepPredictions1.size() != vepPredictions2.size()) {
                                startLine(out, ctx);
                                out.print("\tVEP discordant transcripts count");
                                out.print("\t" + input1.filename + ":" + vepPredictions1.size());
                                out.print("\t" + input2.filename + ":" + vepPredictions2.size());
                                out.println();
                                printed = true;
                            }
                            if (snpEffPredictions1.size() != snpEffPredictions2.size()) {
                                startLine(out, ctx);
                                out.print("\tSNPEFF discordant transcripts count");
                                out.print("\t" + input1.filename + ":" + snpEffPredictions1.size());
                                out.print("\t" + input2.filename + ":" + snpEffPredictions2.size());
                                out.println();
                                printed = true;
                            }
                            if (!unshared(so_vep_1, so_vep_2).isEmpty()) {
                                startLine(out, ctx);
                                out.print("\tVEP discordant SO:terms");
                                printDiscordantSO(out, input1, so_vep_1, input2, so_vep_2);
                                printed = true;
                            }
                            if (!unshared(so_snpeff_1, so_snpeff_2).isEmpty()) {
                                startLine(out, ctx);
                                out.print("\tSNPEFF discordant SO:terms");
                                printDiscordantSO(out, input1, so_snpeff_1, input2, so_snpeff_2);
                                printed = true;
                            }
                        }
                    }
                    if (!printed) {
                        startLine(out, ctx);
                        out.println("\tPASS");
                    }
                    row.clear();
                }
                if (rec == null)
                    break;
            }
            row.add(rec);
        }
        iter.close();
        out.flush();
        out.close();
        out = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
        try {
            if (variants != null)
                variants.cleanup();
        } catch (final Exception err) {
            // ignore cleanup failures
        }
    }
}
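
The concordance checks above hinge on unshared(...), whose body is not shown on this page. Judging by how an empty result is treated as agreement, a plausible reading is a symmetric set difference; the helper below is a hypothetical stand-in under that assumption, not the tool's actual code:

import java.util.HashSet;
import java.util.Set;

// hypothetical stand-in for the tool's unshared(...) helper: the terms
// present in exactly one of the two sets; an empty result means the two
// annotation sources agree at this variant.
static <T> Set<T> unshared(final Set<T> set1, final Set<T> set2) {
    final Set<T> symmetricDiff = new HashSet<>(set1);
    symmetricDiff.addAll(set2);          // union of both sets
    final Set<T> common = new HashSet<>(set1);
    common.retainAll(set2);              // intersection
    symmetricDiff.removeAll(common);     // union minus intersection
    return symmetricDiff;
}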
Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction in project jvarkit by lindenb.
From class VcfGeneSplitter, method getVariantKeys:
private Set<String> getVariantKeys(final VepPredictionParser vepPredictionParser, final VariantContext ctx) {
    final Set<String> keys = new HashSet<>();
    for (final VepPrediction pred : vepPredictionParser.getPredictions(ctx)) {
        String s = pred.getHGNC();
        if (!isEmpty(s) && !this.ignoreHgnc) {
            keys.add(String.format("HGNC_%s_%s", ctx.getContig(), s));
        }
        s = pred.getEnsemblGene();
        if (!isEmpty(s) && !this.ignoreEnsg) {
            keys.add(String.format("ENSG_%s_%s", ctx.getContig(), s));
        }
        /* same as feature
        s = pred.getEnsemblTranscript();
        if (!isEmpty(s)) {
            keys.add(String.format("ENST_%s_%s", ctx.getContig(), s));
        }
        */
        s = pred.getFeature();
        if (!isEmpty(s) && !this.ignoreFeature) {
            keys.add(String.format("FEATURE_%s_%s", ctx.getContig(), s));
            if ((s.startsWith("XM_") || s.startsWith("NM_")) && !this.ignoreRefSeq) {
                keys.add(String.format("REFSEQ_%s_%s", ctx.getContig(), s));
            } else if (s.startsWith("ENST") && !this.ignoreEnst) {
                // was startsWith("ENST_"), which never matches: Ensembl transcript
                // ids carry no underscore (e.g. ENST00000380152)
                keys.add(String.format("ENST_%s_%s", ctx.getContig(), s));
            }
        }
        s = pred.getSymbol();
        if (!isEmpty(s) && !this.ignoreSymbol) {
            keys.add(String.format("SYMBOL_%s_%s", ctx.getContig(), s));
        }
        s = pred.getENSP();
        if (!isEmpty(s) && !this.ignoreENSP) {
            keys.add(String.format("ENSP_%s_%s", ctx.getContig(), s));
        }
    }
    return keys;
}
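
A hedged sketch of how such keys might drive the splitter: a variant is appended to one bucket per key, so a variant overlapping two genes lands in both output groups. The map and loop below are illustrative, not the class's actual code; assume in is a VcfIterator and vepPredictionParser was built as in the sections above:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import htsjdk.variant.variantcontext.VariantContext;

// illustrative driver: group each variant under every key it produces
final Map<String, List<VariantContext>> key2variants = new HashMap<>();
while (in.hasNext()) {
    final VariantContext ctx = in.next();
    for (final String key : getVariantKeys(vepPredictionParser, ctx)) {
        key2variants.computeIfAbsent(key, K -> new ArrayList<>()).add(ctx);
    }
}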
Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction in project jvarkit by lindenb.
From class VcfToRdf, method scanVCF:
private void scanVCF(final File filein) throws IOException {
    VcfIterator in = null;
    URI source = null;
    try {
        if (filein != null)
            source = filein.toURI();
        in = (filein == null ? VCFUtils.createVcfIteratorStdin() : VCFUtils.createVcfIteratorFromFile(filein));
        final VCFHeader header = in.getHeader();
        final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
        writeHeader(header, source);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        while (in.hasNext()) {
            if (this.w.checkError()) {
                LOG.warn("I/O interruption");
                break;
            }
            final VariantContext ctx = progress.watch(in.next());
            /* Variant */
            final URI variant = URI.create("urn:variant/" + ctx.getContig() + ":" + ctx.getStart() + ":" + ctx.getReference().getBaseString());
            emit(variant,
                "rdf:type", "vcf:Variant",
                "vcf:chrom", URI.create("urn:chrom/" + ctx.getContig()),
                "vcf:position", ctx.getStart(),
                "vcf:ref", ctx.getReference().getBaseString(),
                "vcf:id", (ctx.hasID() ? ctx.getID() : null),
                "vcf:qual", (ctx.hasLog10PError() ? ctx.getPhredScaledQual() : null));
            if (this.printAlleles) {
                for (final Allele alt : ctx.getAlternateAlleles()) {
                    emit(variant, "vcf:alt", alt.getBaseString());
                }
            }
            if (this.printFilters) {
                for (final String f : ctx.getFilters()) {
                    emit(variant, "vcf:filter", URI.create("urn:filter/" + f));
                }
            }
            if (this.printVep) {
                for (final VepPrediction prediction : vepPredictionParser.getPredictions(ctx)) {
                    /*
                    final List<Object> L = new ArrayList<>();
                    L.add("rdf:type"); L.add("vep:Prediction");
                    L.add("vcf:variant"); L.add(variant);
                    L.add("vcf:allele"); L.add(prediction.getAllele().getBaseString());
                    for (final SequenceOntologyTree.Term term : prediction.getSOTerms()) {
                        L.add("vcf:so");
                        L.add(URI.create(term.getUri()));
                    }
                    if (prediction.getEnsemblTranscript() != null) {
                        final URI transcriptid = URI.create("http://www.ensembl.org/id/" + prediction.getEnsemblTranscript());
                        L.add("vep:transcript");
                        L.add(transcriptid);
                        if (prediction.getEnsemblGene() != null) {
                            emit(transcriptid,
                                "uniprot:transcribedFrom", // used in uniprot dump
                                URI.create("http://www.ensembl.org/id/" + prediction.getEnsemblGene()));
                        }
                        if (prediction.getEnsemblProtein() != null) {
                            emit(transcriptid,
                                "uniprot:translatedTo", // used in uniprot dump
                                URI.create("http://www.ensembl.org/id/" + prediction.getEnsemblProtein()));
                        }
                    }
                    emit(URI.create("urn:vep/" + (++id_generator)), L.toArray());
                    */
                }
            }
            if (this.printGenotypes) {
                for (final String sample : ctx.getSampleNames()) {
                    final Genotype g = ctx.getGenotype(sample);
                    final List<Object> L = new ArrayList<>();
                    L.add("vcf:sample");
                    L.add(URI.create("urn:sample/" + sample));
                    L.add("vcf:variant");
                    L.add(variant);
                    L.add("rdf:type");
                    L.add("vcf:Genotype");
                    if (g.hasDP()) {
                        L.add("vcf:dp");
                        L.add(g.getDP());
                    }
                    if (g.hasGQ()) {
                        L.add("vcf:gq");
                        L.add(g.getGQ());
                    }
                    if (g.isCalled()) {
                        if (g.isHet()) {
                            if (g.isHetNonRef()) {
                                L.add("rdf:type");
                                L.add("vcf:HetNonRefGenotype");
                            } else {
                                L.add("rdf:type");
                                L.add("vcf:HetGenotype");
                            }
                        } else if (g.isHom()) {
                            if (g.isHomRef()) {
                                L.add("rdf:type");
                                L.add("vcf:HomRefGenotype");
                            } else {
                                L.add("rdf:type");
                                L.add("vcf:HomVarGenotype");
                            }
                        }
                        for (final Allele a : g.getAlleles()) {
                            L.add("vcf:allele");
                            L.add(a.getBaseString());
                        }
                    }
                    emit(URI.create("urn:gt/" + ctx.getContig() + ":" + ctx.getStart() + ":" + ctx.getReference().getBaseString() + ":" + sample), L.toArray());
                }
            }
        }
        in.close();
        in = null;
        progress.finish();
    } catch (final Exception e) {
        throw new IOException(e);
    } finally {
        CloserUtil.close(in);
    }
}
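
The genotype branch above maps htsjdk's Genotype predicates onto four RDF classes. The same decision, condensed into a small helper for reference (the method name is ours, the class strings and predicates come from the code above):

import htsjdk.variant.variantcontext.Genotype;

// same classification as the printGenotypes branch above, condensed
static String rdfTypeOf(final Genotype g) {
    if (!g.isCalled()) return null;     // uncalled genotypes get no extra type
    if (g.isHet()) {
        // a het carrying no REF allele gets its own class
        return g.isHetNonRef() ? "vcf:HetNonRefGenotype" : "vcf:HetGenotype";
    }
    if (g.isHom()) {
        return g.isHomRef() ? "vcf:HomRefGenotype" : "vcf:HomVarGenotype";
    }
    return "vcf:Genotype";              // any other case keeps the base type
}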
Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction in project jvarkit by lindenb.
From class VcfGeneOntology, method filterVcfIterator:
private void filterVcfIterator(final VcfIterator in) throws IOException {
    VariantContextWriter w = null;
    try {
        final VCFHeader header = in.getHeader();
        final VCFHeader h2 = new VCFHeader(header);
        h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "GO terms from GO " + GO + " and GOA=" + GOA));
        h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
        h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
        if (filterName != null) {
            h2.addMetaDataLine(new VCFFilterHeaderLine(filterName, "Flag GO terms " + (inverse_filter ? "not descendant of " : "") + "the provided GO terms"));
        }
        w = super.openVariantContextWriter(outputFile);
        w.writeHeader(h2);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
        final SnpEffPredictionParser snpEffPredictionParser = new SnpEffPredictionParserFactory().header(header).get();
        final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory().header(header).get();
        while (in.hasNext()) {
            if (System.out.checkError())
                break;
            final VariantContext ctx = progress.watch(in.next());
            /* gene symbols seen for this variant */
            final Set<String> symbols = new HashSet<String>();
            /* scan the SnpEff genes */
            for (final SnpEffPrediction pred : snpEffPredictionParser.getPredictions(ctx)) {
                final String genName = pred.getGeneName();
                if (genName == null || genName.isEmpty())
                    continue;
                symbols.add(genName);
            }
            /* scan the VEP genes */
            for (final VepPrediction pred : vepPredictionParser.getPredictions(ctx)) {
                String genName = pred.getGeneName();
                if (!(genName == null || genName.isEmpty())) {
                    symbols.add(genName);
                }
                genName = pred.getGene();
                if (!(genName == null || genName.isEmpty())) {
                    symbols.add(genName);
                }
                genName = pred.getHGNC();
                if (!(genName == null || genName.isEmpty())) {
                    symbols.add(genName);
                }
            }
            /* only keep genes known to GOA */
            symbols.retainAll(this.name2go.keySet());
            boolean found_child_of_filter = false;
            /* the INFO attribute values */
            final List<String> atts = new ArrayList<String>();
            /* loop over the symbols */
            for (final String symbol : symbols) {
                /* GO terms associated with this symbol */
                final Set<GoTree.Term> t2 = this.name2go.get(symbol);
                if (t2 == null || t2.isEmpty())
                    continue;
                final StringBuilder sb = new StringBuilder(symbol);
                sb.append("|");
                boolean first = true;
                for (final GoTree.Term gt : t2) {
                    /* the user gave terms to filter on */
                    if (!found_child_of_filter && this.goTermToFilter != null) {
                        for (final GoTree.Term userTerm : this.goTermToFilter) {
                            if (userTerm.hasDescendant(gt.getAcn())) {
                                found_child_of_filter = true;
                                break;
                            }
                        }
                    }
                    if (!first)
                        sb.append("&");
                    sb.append(gt.getAcn());
                    first = false;
                }
                atts.add(sb.toString());
            }
            /* no GO term was found */
            if (atts.isEmpty()) {
                if (!removeIfNoGo) {
                    w.add(ctx);
                }
                continue;
            }
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            /* check the children of the user's terms */
            if (this.goTermToFilter != null) {
                /* variant matches (or fails to match, when inverted) the user's terms */
                if ((this.inverse_filter && found_child_of_filter) || (!this.inverse_filter && !found_child_of_filter)) {
                    /* don't remove the variant, but set the FILTER */
                    if (this.filterName != null) {
                        final Set<String> filters = new HashSet<String>(ctx.getFilters());
                        filters.add(this.filterName);
                        vcb.filters(filters);
                    } else {
                        continue;
                    }
                }
            }
            /* add the GO terms */
            vcb.attribute(this.TAG, atts);
            w.add(vcb.make());
        }
        progress.finish();
        w.close();
        w = null;
    } finally {
        CloserUtil.close(w);
        w = null;
    }
}
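
For reference, each INFO attribute value assembled in the loop above is the gene symbol, a pipe, then the GO accessions joined by '&' (one such string per symbol). Restated as a tiny helper; the method name and the example values are ours:

import java.util.Collection;

// layout of one attribute entry, e.g. "BRCA1|GO:0005634&GO:0003677"
// (symbol and accessions here are illustrative)
static String goAttribute(final String symbol, final Collection<String> accessions) {
    return symbol + "|" + String.join("&", accessions);
}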