Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
The class VcfBurdenFilterGenes, method doVcfToVcf: keeps or FILTERs variants according to whether their VEP/ANN predictions match a gene list.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
    final VCFHeader header = in.getHeader();
    try {
        final VCFHeader h2 = addMetaData(new VCFHeader(header));
        final VCFFilterHeaderLine filterControlsHeader;
        if (!StringUtil.isBlank(this.filterTag)) {
            filterControlsHeader = new VCFFilterHeaderLine(this.filterTag.trim(), "Genes not in list " + this.geneFile);
            h2.addMetaDataLine(filterControlsHeader);
        } else {
            filterControlsHeader = null;
        }
        final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
        final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
        final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header.getSequenceDictionary()).logger(LOG);
        out.writeHeader(h2);
        while (in.hasNext() && !out.checkError()) {
            final VariantContext ctx = progress.watch(in.next());
            boolean keep = false;
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            // not just set FILTER ?
            if (filterControlsHeader == null) {
                vcb.rmAttribute(vepParser.getTag());
                vcb.rmAttribute(annParser.getTag());
            }
            // keep the VEP predictions whose gene-related columns match the gene list
            final List<String> newVepList = new ArrayList<>();
            for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
                for (final String col : lookColumns) {
                    final String token = pred.getByCol(col);
                    if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
                        newVepList.add(predStr);
                        keep = true;
                        break;
                    }
                }
            }
            // same test on the SnpEff ANN predictions; note the break exits after the first matching prediction
            final List<String> newEffList = new ArrayList<>();
            for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
                final String token = pred.getGeneName();
                if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
                    newEffList.add(predStr);
                    keep = true;
                    break;
                }
            }
            // not just set FILTER ?
            if (filterControlsHeader == null) {
                if (!newVepList.isEmpty())
                    vcb.attribute(vepParser.getTag(), newVepList);
                if (!newEffList.isEmpty())
                    vcb.attribute(annParser.getTag(), newEffList);
            }
            if (filterControlsHeader != null) {
                if (!keep) {
                    vcb.filter(filterControlsHeader.getID());
                } else if (!ctx.isFiltered()) {
                    vcb.passFilters();
                }
                out.add(vcb.make());
            } else {
                if (keep)
                    out.add(vcb.make());
            }
        }
        progress.finish();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(in);
    }
}
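The pattern above (parse each prediction string, test a set of gene columns, then rewrite the INFO field or set a FILTER) can be reproduced with the parser API alone. Below is a minimal, hypothetical sketch that counts variants whose VEP SYMBOL column hits a gene set; the input path and gene name are placeholders, and it assumes htsjdk and jvarkit are on the classpath.

    import java.io.File;
    import java.util.Collections;
    import java.util.Set;

    import htsjdk.variant.variantcontext.VariantContext;
    import htsjdk.variant.vcf.VCFFileReader;

    import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser;
    import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory;

    public class CountVepGeneHits {
        public static void main(final String[] args) {
            final File vcfFile = new File("input.vcf.gz"); // placeholder path
            final Set<String> geneNames = Collections.singleton("BRCA1"); // placeholder gene list
            long hits = 0L;
            try (VCFFileReader reader = new VCFFileReader(vcfFile, false)) {
                // build the parser from the header, as doVcfToVcf does
                final VepPredictionParser vepParser = new VepPredictionParserFactory(reader.getFileHeader()).get();
                for (final VariantContext ctx : reader) {
                    // one VepPrediction per transcript in the CSQ INFO attribute
                    final boolean match = vepParser.getPredictions(ctx).stream()
                            .map(pred -> pred.getByCol("SYMBOL"))
                            .anyMatch(symbol -> symbol != null && geneNames.contains(symbol));
                    if (match)
                        ++hits;
                }
            }
            System.out.println("variants matching the gene list: " + hits);
        }
    }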
Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
The class VcfToSql, method read: walks a VCF and emits SQL inserts for alleles, samples, filters, chromosomes, variants, and genotypes.
private void read(File filename) throws IOException {
    /* insert ATGC */
    this.alleleTable.insert(outputWriter, null, "A");
    this.alleleTable.insert(outputWriter, null, "C");
    this.alleleTable.insert(outputWriter, null, "G");
    this.alleleTable.insert(outputWriter, null, "T");
    /* insert this VCF file */
    this.vcfFileTable.insert(outputWriter, null, filename);
    final SelectStmt vcffile_id = new SelectStmt(this.vcfFileTable);
    final Map<String, SelectStmt> sample2sampleid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> filter2filterid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> chrom2chromId = new HashMap<String, SelectStmt>();
    final VcfIterator r = VCFUtils.createVcfIteratorFromFile(filename);
    final VCFHeader header = r.getHeader();
    /* parse samples */
    for (final String sampleName : header.getSampleNamesInOrder()) {
        this.sampleTable.insert(outputWriter, null, sampleName);
        SelectStmt sample_id = new SelectStmt(this.sampleTable, "name", sampleName);
        sample2sampleid.put(sampleName, sample_id);
        this.sample2fileTable.insert(outputWriter, null, vcffile_id, sample_id);
    }
    /* parse filters */
    for (final VCFFilterHeaderLine filter : header.getFilterLines()) {
        this.filterTable.insert(outputWriter, null, vcffile_id, filter.getID(), filter.getValue());
        filter2filterid.put(filter.getID(), new SelectStmt(this.filterTable, "name", filter.getID()));
    }
    filter2filterid.put(VCFConstants.PASSES_FILTERS_v4, new SelectStmt(this.filterTable, "name", VCFConstants.PASSES_FILTERS_v4));
    final SAMSequenceDictionary dict = header.getSequenceDictionary();
    if (dict == null) {
        throw new RuntimeException("dictionary missing in VCF");
    }
    /* parse sequence dictionary */
    for (final SAMSequenceRecord ssr : dict.getSequences()) {
        this.chromosomeTable.insert(outputWriter, null, vcffile_id, ssr.getSequenceName(), ssr.getSequenceLength());
        chrom2chromId.put(ssr.getSequenceName(), new SelectStmt(this.chromosomeTable, "name", ssr.getSequenceName()));
    }
    VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
    SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
    int nVariants = 0;
    while (r.hasNext()) {
        if (this.outputWriter.checkError())
            break;
        VariantContext var = progress.watch(r.next());
        ++nVariants;
        /* insert ref allele */
        this.alleleTable.insert(outputWriter, null, var.getReference().getBaseString());
        /* insert variant */
        this.variantTable.insert(outputWriter, null, vcffile_id, nVariants, chrom2chromId.get(var.getContig()), var.getStart(), (var.hasID() ? var.getID() : null), new SelectStmt(this.alleleTable, "bases", var.getReference().getBaseString()), (var.hasLog10PError() ? var.getPhredScaledQual() : null));
        SelectStmt variant_id = new SelectStmt(variantTable);
        /* insert alternate alleles */
        for (Allele alt : var.getAlternateAlleles()) {
            this.alleleTable.insert(outputWriter, null, alt.getBaseString());
            this.variant2altTable.insert(outputWriter, null, variant_id, new SelectStmt(this.alleleTable, "bases", alt.getBaseString()));
        }
        /* insert filters */
        for (final String filter : var.getFilters()) {
            if (filter2filterid.get(filter) == null) {
                throw new IOException("VCF Error: filter " + filter + " is not defined in the VCF header.");
            }
            this.variant2filters.insert(outputWriter, null, variant_id, filter2filterid.get(filter));
        }
        if (!this.ignore_info) {
            for (final VepPrediction pred : vepPredictionParser.getPredictions(var)) {
                /*
                vepPrediction.insert(
                    outputWriter,
                    null,
                    variant_id,
                    pred.getEnsemblGene(),
                    pred.getEnsemblTranscript(),
                    pred.getEnsemblProtein(),
                    pred.getSymbol());
                SelectStmt pred_id = new SelectStmt(vepPrediction);
                for (SequenceOntologyTree.Term t : pred.getSOTerms()) {
                    String term = t.getAcn().replace(':', '_');
                    // for bioportal compatibility
                    soTermTable.insert(outputWriter, null, term, t.getAcn());
                    SelectStmt term_id = new SelectStmt(soTermTable, "acn", term);
                    vepPrediction2so.insert(outputWriter, null, pred_id, term_id);
                }
                */
            }
        }
        /* insert genotypes */
        for (final String sampleName : sample2sampleid.keySet()) {
            final Genotype g = var.getGenotype(sampleName);
            if (!g.isAvailable() || g.isNoCall())
                continue;
            genotypeTable.insert(outputWriter, null, variant_id, sample2sampleid.get(sampleName), g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(0).getBaseString()) : null, g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(1).getBaseString()) : null, g.hasDP() ? g.getDP() : null, g.hasGQ() ? g.getGQ() : null);
        }
    }
    r.close();
}
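The Table and SelectStmt helpers are internal to VcfToSql, but the traversal order (samples, filter lines, sequence dictionary, then per-variant alleles and genotypes) is plain htsjdk. This hypothetical sketch performs the same walk and prints the values that would become rows; the file path is a placeholder.

    import java.io.File;

    import htsjdk.samtools.SAMSequenceDictionary;
    import htsjdk.samtools.SAMSequenceRecord;
    import htsjdk.variant.variantcontext.Allele;
    import htsjdk.variant.variantcontext.Genotype;
    import htsjdk.variant.variantcontext.VariantContext;
    import htsjdk.variant.vcf.VCFFileReader;
    import htsjdk.variant.vcf.VCFFilterHeaderLine;
    import htsjdk.variant.vcf.VCFHeader;

    public class DumpVcfEntities {
        public static void main(final String[] args) {
            final File vcfFile = new File("input.vcf.gz"); // placeholder path
            try (VCFFileReader reader = new VCFFileReader(vcfFile, false)) {
                final VCFHeader header = reader.getFileHeader();
                // the same entities VcfToSql turns into table rows
                for (final String sampleName : header.getSampleNamesInOrder())
                    System.out.println("SAMPLE\t" + sampleName);
                for (final VCFFilterHeaderLine filter : header.getFilterLines())
                    System.out.println("FILTER\t" + filter.getID());
                final SAMSequenceDictionary dict = header.getSequenceDictionary();
                if (dict == null)
                    throw new IllegalStateException("dictionary missing in VCF");
                for (final SAMSequenceRecord ssr : dict.getSequences())
                    System.out.println("CHROM\t" + ssr.getSequenceName() + "\t" + ssr.getSequenceLength());
                for (final VariantContext var : reader) {
                    System.out.println("VARIANT\t" + var.getContig() + ":" + var.getStart());
                    for (final Allele alt : var.getAlternateAlleles())
                        System.out.println("ALT\t" + alt.getBaseString());
                    for (final Genotype g : var.getGenotypes())
                        if (g.isCalled())
                            System.out.println("GT\t" + g.getSampleName() + "\t" + g.getGenotypeString());
                }
            }
        }
    }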
Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
The class VCFComparePredictions, method doWork: compares VEP and SnpEff predictions for the same position across several VCFs and reports discordances.
@Override
public int doWork(List<String> args) {
    PrintWriter out = null;
    SortingCollection<LineAndFile> variants = null;
    try {
        if (args.isEmpty()) {
            LOG.error("Illegal number of arguments");
            return -1;
        }
        out = super.openFileOrStdoutAsPrintWriter(super.outputFile);
        variants = SortingCollection.newInstance(LineAndFile.class, new AbstractVCFCompareBase.LineAndFileCodec(), new AbstractVCFCompareBase.LineAndFileComparator(), super.sortingCollectionArgs.getMaxRecordsInRam(), super.sortingCollectionArgs.getTmpPaths());
        variants.setDestructiveIteration(true);
        for (final String filename : args) {
            LOG.info("Reading from " + filename);
            Input input = super.put(variants, filename);
            LOG.info("end reading " + input.filename);
        }
        List<PredictionTuple> predictionTuples = new ArrayList<PredictionTuple>(super.inputs.size());
        for (AbstractVCFCompareBase.Input input : this.inputs) {
            PredictionTuple predictionTuple = new PredictionTuple();
            predictionTuple.snpEffPredictionParser = new SnpEffPredictionParserFactory(input.codecAndHeader.header).get();
            predictionTuple.vepPredictionParser = new VepPredictionParserFactory(input.codecAndHeader.header).get();
            predictionTuples.add(predictionTuple);
        }
        List<AbstractVCFCompareBase.LineAndFile> row = new ArrayList<LineAndFile>(super.inputs.size());
        CloseableIterator<LineAndFile> iter = variants.iterator();
        final Comparator<LineAndFile> posCompare = (A, B) -> A.getContigPosRef().compareTo(B.getContigPosRef());
        for (;;) {
            LineAndFile rec = null;
            if (iter.hasNext()) {
                rec = iter.next();
            }
            if (rec == null || (!row.isEmpty() && posCompare.compare(row.get(0), rec) != 0)) {
                if (!row.isEmpty()) {
                    boolean printed = false;
                    VariantContext ctx = row.get(0).getContext();
                    if (row.size() != this.inputs.size()) {
                        startLine(out, ctx);
                        out.println("\tDiscordant number of variants");
                        printed = true;
                    }
                    for (int i = 0; i + 1 < row.size(); ++i) {
                        Input input1 = this.inputs.get(row.get(i).fileIdx);
                        VariantContext ctx1 = row.get(i).getContext();
                        PredictionTuple predtuple1 = predictionTuples.get(row.get(i).fileIdx);
                        List<VepPrediction> vepPredictions1 = predtuple1.vepPredictionParser.getPredictions(ctx1);
                        List<SnpEffPrediction> snpEffPredictions1 = predtuple1.snpEffPredictionParser.getPredictions(ctx1);
                        Set<SequenceOntologyTree.Term> so_vep_1 = getVepSoTerms(predtuple1.vepPredictionParser, ctx1);
                        Set<SequenceOntologyTree.Term> so_snpeff_1 = getSnpEffSoTerms(predtuple1.snpEffPredictionParser, ctx1);
                        for (int j = i + 1; j < row.size(); ++j) {
                            Input input2 = this.inputs.get(row.get(j).fileIdx);
                            VariantContext ctx2 = row.get(j).getContext();
                            PredictionTuple predtuple2 = predictionTuples.get(row.get(j).fileIdx);
                            List<VepPrediction> vepPredictions2 = predtuple2.vepPredictionParser.getPredictions(ctx2);
                            List<SnpEffPrediction> snpEffPredictions2 = predtuple2.snpEffPredictionParser.getPredictions(ctx2);
                            Set<SequenceOntologyTree.Term> so_vep_2 = getVepSoTerms(predtuple2.vepPredictionParser, ctx2);
                            Set<SequenceOntologyTree.Term> so_snpeff_2 = getSnpEffSoTerms(predtuple2.snpEffPredictionParser, ctx2);
                            if (vepPredictions1.size() != vepPredictions2.size()) {
                                startLine(out, ctx);
                                out.print("\tVEP discordant transcripts count");
                                out.print("\t" + input1.filename + ":" + vepPredictions1.size());
                                out.print("\t" + input2.filename + ":" + vepPredictions2.size());
                                out.println();
                                printed = true;
                            }
                            if (snpEffPredictions1.size() != snpEffPredictions2.size()) {
                                startLine(out, ctx);
                                out.print("\tSNPEFF discordant transcripts count");
                                out.print("\t" + input1.filename + ":" + snpEffPredictions1.size());
                                out.print("\t" + input2.filename + ":" + snpEffPredictions2.size());
                                out.println();
                                printed = true;
                            }
                            if (!unshared(so_vep_1, so_vep_2).isEmpty()) {
                                startLine(out, ctx);
                                out.print("\tVEP discordant SO:terms");
                                printDiscordantSO(out, input1, so_vep_1, input2, so_vep_2);
                                printed = true;
                            }
                            if (!unshared(so_snpeff_1, so_snpeff_2).isEmpty()) {
                                startLine(out, ctx);
                                out.print("\tSNPEFF discordant SO:terms");
                                printDiscordantSO(out, input1, so_snpeff_1, input2, so_snpeff_2);
                                printed = true;
                            }
                        }
                    }
                    if (!printed) {
                        startLine(out, ctx);
                        out.println("\tPASS");
                    }
                    row.clear();
                }
                if (rec == null)
                    break;
            }
            row.add(rec);
        }
        iter.close();
        out.flush();
        out.close();
        out = null;
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
        try {
            if (variants != null)
                variants.cleanup();
        } catch (Exception err) {
        }
    }
}
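The unshared(...) test used above amounts to a symmetric difference between two SO-term sets: the result is empty exactly when both files annotate the variant with the same terms. A small self-contained sketch of that operation, using plain strings in place of SequenceOntologyTree.Term (requires Java 9+ for Set.of):

    import java.util.HashSet;
    import java.util.Set;

    public final class SetDiff {
        // elements present in exactly one of the two sets; empty means concordant
        static <T> Set<T> unshared(final Set<T> a, final Set<T> b) {
            final Set<T> result = new HashSet<>(a);
            result.addAll(b);              // union
            final Set<T> common = new HashSet<>(a);
            common.retainAll(b);           // intersection
            result.removeAll(common);      // union minus intersection
            return result;
        }

        public static void main(final String[] args) {
            final Set<String> so1 = Set.of("SO:0001583", "SO:0001587");
            final Set<String> so2 = Set.of("SO:0001583", "SO:0001821");
            System.out.println(unshared(so1, so2)); // [SO:0001587, SO:0001821] in some order
        }
    }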
Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
The class VcfTools, method init: builds the SnpEff, VEP, and ANN prediction parsers from a VCF header.
public void init(final VCFHeader header) {
    this.header = header;
    this.snpEffPredictionParser = new SnpEffPredictionParserFactory().header(header).get();
    this.vepPredictionParser = new VepPredictionParserFactory().header(header).get();
    this.annPredictionParser = new AnnPredictionParserFactory(header).get();
}
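init illustrates the two construction idioms these factories support in this codebase: passing the header to the constructor (as the AnnPredictionParserFactory line does) or setting it through the fluent header(...) method. A hypothetical snippet showing both forms side by side:

    import htsjdk.variant.vcf.VCFHeader;

    import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser;
    import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory;

    public class ParserConstruction {
        // both idioms appear in jvarkit and should yield an equivalent parser
        static VepPredictionParser fromConstructor(final VCFHeader header) {
            return new VepPredictionParserFactory(header).get();
        }

        static VepPredictionParser fromFluentSetter(final VCFHeader header) {
            return new VepPredictionParserFactory().header(header).get();
        }
    }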
Use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory in project jvarkit by lindenb.
The class VcfBurden, method doWork2: groups consequence-filtered variants per gene and transcript, then dumps them into a ZIP archive for burden testing.
private int doWork2(List<String> args) {
    ZipOutputStream zout = null;
    FileOutputStream fout = null;
    VcfIterator in = null;
    try {
        if (args.isEmpty()) {
            LOG.info("reading from stdin.");
            in = VCFUtils.createVcfIteratorStdin();
        } else if (args.size() == 1) {
            String filename = args.get(0);
            LOG.info("reading from " + filename);
            in = VCFUtils.createVcfIterator(filename);
        } else {
            LOG.error("Illegal number of arguments.");
            return -1;
        }
        if (outputFile == null) {
            LOG.error("undefined output");
            return -1;
        } else if (!outputFile.getName().endsWith(".zip")) {
            LOG.error("output " + outputFile + " should end with .zip");
            return -1;
        } else {
            fout = new FileOutputStream(outputFile);
            zout = new ZipOutputStream(fout);
        }
        final List<String> samples = in.getHeader().getSampleNamesInOrder();
        final VCFHeader header = in.getHeader();
        String prev_chrom = null;
        final VepPredictionParser vepPredParser = new VepPredictionParserFactory().header(header).get();
        final Map<GeneTranscript, List<VariantAndCsq>> gene2variants = new HashMap<>();
        final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
        final Set<SequenceOntologyTree.Term> acn = new HashSet<>();
        /* mail from Solena: these SO terms replace the current ones (VEP HIGH + MODERATE); no need to re-run past analyses, but they will be used for future burden analyses */
        String[] acn_list = new String[] {
            "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587",
            "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889",
            "SO:0001821", "SO:0001822", "SO:0001583", "SO:0001818"
            /*
            "SO:0001589", "SO:0001587", "SO:0001582", "SO:0001583",
            "SO:0001575", "SO:0001578", "SO:0001574", "SO:0001889",
            "SO:0001821", "SO:0001822", "SO:0001893"
            */
        };
        if (this.highdamage) {
            acn_list = new String[] { "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587", "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889" };
        }
        for (final String acns : acn_list) {
            final SequenceOntologyTree.Term tacn = soTree.getTermByAcn(acns);
            if (tacn == null) {
                in.close();
                throw new NullPointerException("tacn == null for " + acns);
            }
            acn.addAll(tacn.getAllDescendants());
        }
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader());
        for (;;) {
            VariantContext ctx1 = null;
            if (in.hasNext()) {
                ctx1 = progress.watch(in.next());
                if (ctx1.getAlternateAlleles().size() != 1) {
                    // info("count(ALT)!=1 in "+ctx1.getChr()+":"+ctx1.getStart());
                    continue;
                }
                if (ctx1.getAlternateAlleles().get(0).equals(Allele.SPAN_DEL)) {
                    continue;
                }
            }
            if (ctx1 == null || !ctx1.getContig().equals(prev_chrom)) {
                LOG.info("DUMP to zip n=" + gene2variants.size());
                final Set<String> geneNames = new HashSet<>();
                for (GeneTranscript gene_transcript : gene2variants.keySet()) {
                    geneNames.add(gene_transcript.geneName);
                    final Set<VariantAndCsq> uniq = new TreeSet<>(this.variantAndCsqComparator);
                    uniq.addAll(gene2variants.get(gene_transcript));
                    dumpVariants(zout, prev_chrom, gene_transcript.geneName + "_" + gene_transcript.transcriptName, samples, new ArrayList<VariantAndCsq>(uniq));
                    LOG.info("dumped " + gene_transcript.geneName);
                }
                LOG.info("loop over geneName");
                for (final String geneName : geneNames) {
                    final SortedSet<VariantAndCsq> lis_nm = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_all = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_refseq = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_enst = new TreeSet<>(this.variantAndCsqComparator);
                    LOG.info("loop over gene2variants");
                    for (final GeneTranscript gene_transcript : gene2variants.keySet()) {
                        if (!geneName.equals(gene_transcript.geneName))
                            continue;
                        lis_all.addAll(gene2variants.get(gene_transcript));
                        if (gene_transcript.transcriptName.startsWith("NM_")) {
                            lis_nm.addAll(gene2variants.get(gene_transcript));
                        }
                        if (!gene_transcript.transcriptName.startsWith("ENST")) {
                            lis_refseq.addAll(gene2variants.get(gene_transcript));
                        }
                        if (gene_transcript.transcriptName.startsWith("ENST")) {
                            lis_enst.addAll(gene2variants.get(gene_transcript));
                        }
                    }
                    LOG.info("dump_ALL_TRANSCRIPTS");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_TRANSCRIPTS", samples, new ArrayList<VariantAndCsq>(lis_all));
                    LOG.info("dump_ALL_NM");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_NM", samples, new ArrayList<VariantAndCsq>(lis_nm));
                    LOG.info("dump_ALL_REFSEQ");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_REFSEQ", samples, new ArrayList<VariantAndCsq>(lis_refseq));
                    LOG.info("dump_ALL_ENST");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_ENST", samples, new ArrayList<VariantAndCsq>(lis_enst));
                }
                if (ctx1 == null)
                    break;
                LOG.info("gene2variants");
                gene2variants.clear();
                LOG.info("System.gc();");
                System.gc();
                prev_chrom = ctx1.getContig();
                LOG.info("prev_chrom=" + prev_chrom);
            }
            final Set<GeneTranscript> seen_names = new HashSet<>();
            for (final VepPredictionParser.VepPrediction pred : vepPredParser.getPredictions(ctx1)) {
                String geneName = pred.getSymbol();
                if (geneName == null || geneName.trim().isEmpty())
                    continue;
                if (this._gene2seen != null) {
                    if (!this._gene2seen.containsKey(geneName))
                        continue;
                }
                final String transcriptName = pred.getFeature();
                if (transcriptName == null || transcriptName.trim().isEmpty()) {
                    LOG.info("No transcript in " + ctx1);
                    continue;
                }
                final GeneTranscript geneTranscript = new GeneTranscript(geneName, transcriptName);
                if (seen_names.contains(geneTranscript))
                    continue;
                boolean ok = false;
                for (SequenceOntologyTree.Term so : pred.getSOTerms()) {
                    if (acn.contains(so)) {
                        ok = true;
                    }
                }
                if (!printSOTerms && !ok)
                    continue;
                List<VariantAndCsq> L = gene2variants.get(geneTranscript);
                if (L == null) {
                    L = new ArrayList<>();
                    gene2variants.put(geneTranscript, L);
                }
                Float vqslod = null;
                if (this.printVQSLOD && ctx1.hasAttribute("VQSLOD")) {
                    vqslod = (float) ctx1.getAttributeAsDouble("VQSLOD", -9999999.0);
                }
                L.add(new VariantAndCsq(ctx1, pred.getSOTerms(), pred.getPositionInCDS(), vqslod));
                seen_names.add(geneTranscript);
                if (this._gene2seen != null) {
                    this._gene2seen.put(geneTranscript.geneName, Boolean.TRUE);
                }
            }
        }
        if (this._gene2seen != null) {
            final List<VariantAndCsq> emptylist = Collections.emptyList();
            for (final String gene : this._gene2seen.keySet()) {
                if (this._gene2seen.get(gene).equals(Boolean.TRUE))
                    continue;
                LOG.warning("Gene not found : " + gene);
                dumpVariants(zout, "UNDEFINED", gene + "_000000000000000.txt", samples, emptylist);
            }
        }
        progress.finish();
        zout.finish();
        fout.flush();
        zout.flush();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(in);
        CloserUtil.close(zout);
        CloserUtil.close(fout);
    }
}
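The consequence filter in doWork2 hinges on expanding each SO accession into the term plus its descendants via SequenceOntologyTree, so that a child term of a listed consequence also passes. A minimal hypothetical sketch of that expansion; the import path is assumed from jvarkit's util.so package, and the sample accessions are placeholders.

    import java.util.HashSet;
    import java.util.Set;

    import com.github.lindenb.jvarkit.util.so.SequenceOntologyTree;

    public class ExpandSoTerms {
        public static void main(final String[] args) {
            final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
            final Set<SequenceOntologyTree.Term> acn = new HashSet<>();
            // placeholder accessions: splice acceptor / splice donor variants
            for (final String acc : new String[] { "SO:0001574", "SO:0001575" }) {
                final SequenceOntologyTree.Term tacn = soTree.getTermByAcn(acc);
                if (tacn == null)
                    throw new IllegalArgumentException("unknown SO accession " + acc);
                // expand to descendant terms, as doWork2 does before testing predictions
                acn.addAll(tacn.getAllDescendants());
            }
            System.out.println(acn.size() + " terms after expansion");
        }
    }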