Search in sources :

Example 6 with VepPredictionParser

use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser in project jvarkit by lindenb.

the class VcfGeneSplitter method doVcfToVcf.

/**
 * Splits the input VCF by key and stores one VCF per key in a zip archive.
 * <p>
 * Every variant line is paired with each key returned by
 * {@code getVariantKeys(vepPredictionParser, ctx)} and pushed into a
 * {@link SortingCollection}. Each run of records sharing the same key is then
 * written to the zip entry {@code baseZipDir/<key>.vcf}. A tab-delimited
 * summary (contig, 0-based start, end, key, number of variants) is written
 * last as {@code baseZipDir/manifest.bed}.
 *
 * @param inputName URI of the VCF to read, or {@code null} to read from stdin
 * @param outputFile the zip file to create
 * @return {@code RETURN_OK} on success, {@code -1} if any exception was raised
 */
@Override
protected int doVcfToVcf(String inputName, File outputFile) {
    SortingCollection<KeyAndLine> sortingcollection = null;
    BufferedReader in = null;
    FileOutputStream fos = null;
    ZipOutputStream zout = null;
    CloseableIterator<KeyAndLine> iter = null;
    EqualRangeIterator<KeyAndLine> eqiter = null;
    PrintWriter pw = null;
    File tmpReportFile = null;
    try {
        in = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
        // build the VEP prediction parser from the VCF header
        final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory().header(cah.header).get();
        sortingcollection = SortingCollection.newInstance(KeyAndLine.class, new KeyAndLineCodec(), new KeyAndLineComparator(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingcollection.setDestructiveIteration(true);
        // read variants
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(cah.header);
        String line;
        while ((line = in.readLine()) != null) {
            final VariantContext ctx = progess.watch(cah.codec.decode(line));
            // no check for ctx.isFiltered here, we do this later.
            for (final String key : this.getVariantKeys(vepPredictionParser, ctx)) {
                sortingcollection.add(new KeyAndLine(key, line));
            }
        }
        progess.finish();
        sortingcollection.doneAdding();
        LOG.info("creating zip " + outputFile);
        fos = new FileOutputStream(outputFile);
        zout = new ZipOutputStream(fos);
        tmpReportFile = File.createTempFile("_tmp.", ".txt", writingSortingCollection.getTmpDirectories().get(0));
        // fallback only: the file is deleted explicitly in the finally block
        tmpReportFile.deleteOnExit();
        pw = IOUtils.openFileForPrintWriter(tmpReportFile);
        pw.println("#chrom\tstart\tend\tkey\tCount_Variants");
        iter = sortingcollection.iterator();
        // group consecutive records on the key only
        eqiter = new EqualRangeIterator<>(iter, new Comparator<KeyAndLine>() {

            @Override
            public int compare(final KeyAndLine o1, final KeyAndLine o2) {
                return o1.key.compareTo(o2.key);
            }
        });
        while (eqiter.hasNext()) {
            final List<KeyAndLine> buffer = eqiter.next();
            final KeyAndLine first = buffer.get(0);
            LOG.info(first.key);
            final List<VariantContext> variants = new ArrayList<>(buffer.size());
            String contig = null;
            int chromStart = Integer.MAX_VALUE;
            int chromEnd = 0;
            for (final KeyAndLine kal : buffer) {
                final VariantContext ctx = cah.codec.decode(kal.ctx);
                variants.add(ctx);
                contig = ctx.getContig();
                chromStart = Math.min(chromStart, ctx.getStart());
                chromEnd = Math.max(chromEnd, ctx.getEnd());
            }
            pw.println(contig + "\t" + (chromStart - 1) + // -1 for bed compatibility
            "\t" + chromEnd + "\t" + first.key + "\t" + variants.size());
            // save vcf file
            final ZipEntry ze = new ZipEntry(this.baseZipDir + "/" + first.key + ".vcf");
            zout.putNextEntry(ze);
            final VariantContextWriter out = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(zout));
            final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
            header2.addMetaDataLine(new VCFHeaderLine("VcfGeneSplitter.Name", String.valueOf(first.key)));
            out.writeHeader(header2);
            for (final VariantContext ctx : variants) {
                out.add(ctx);
            }
            // closing the writer is safe: the zip stream is wrapped by IOUtils.uncloseableOutputStream
            out.close();
            zout.closeEntry();
        }
        eqiter.close();
        eqiter = null;
        iter.close();
        iter = null;
        LOG.info("saving report");
        pw.flush();
        pw.close();
        pw = null;
        final ZipEntry entry = new ZipEntry(this.baseZipDir + "/manifest.bed");
        zout.putNextEntry(entry);
        IOUtils.copyTo(tmpReportFile, zout);
        zout.closeEntry();
        zout.finish();
        zout.close();
        zout = null;
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(eqiter);
        CloserUtil.close(iter);
        if (sortingcollection != null)
            sortingcollection.cleanup();
        CloserUtil.close(in);
        CloserUtil.close(pw);
        // close the zip stream too on the error path, then the raw file stream
        // in case the zip stream could not be closed cleanly
        CloserUtil.close(zout);
        CloserUtil.close(fos);
        // the report has been copied into the archive (or the run failed): drop it now
        if (tmpReportFile != null) {
            tmpReportFile.delete();
        }
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) ZipEntry(java.util.zip.ZipEntry) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) IOException(java.io.IOException) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) ZipOutputStream(java.util.zip.ZipOutputStream) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) File(java.io.File) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 7 with VepPredictionParser

use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser in project jvarkit by lindenb.

the class VCFComposite method doWork.

/**
 * Entry point of the tool.
 * <p>
 * When {@code listModels} is set, prints the name and description of every
 * {@code Type} and returns. Otherwise it parses the pedigree, reads the VCF
 * (from the single file in {@code args} or from stdin) and, contig by contig,
 * groups the accepted variants by gene key (ANN and VEP predictions, prefixed
 * with the contig name) before handing each group to the disease model's
 * {@code scan} method.
 *
 * @param args command-line arguments; {@code oneFileOrNull(args)} selects the input
 * @return {@code 0} on success, {@code -1} on error
 */
@Override
public int doWork(final List<String> args) {
    PrintWriter out = null;
    LineIterator r = null;
    SortingCollection<GeneAndVariant> sorting = null;
    try {
        out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        if (listModels) {
            for (final Type t : Type.values()) {
                out.println(t.name());
                out.println("\t" + t.getDescription());
            }
            out.flush();
            return 0;
        }
        this.pedigree = Pedigree.newParser().parse(pedigreeFile);
        if (this.pedigree.getAffected().isEmpty()) {
            LOG.error("No Affected sample in " + this.pedigreeFile);
            return -1;
        }
        if (this.pedigree.getUnaffected().isEmpty()) {
            LOG.error("No Unaffected sample in " + this.pedigreeFile);
            return -1;
        }
        final DiseaseModel model = this.createModel();
        final String inputName = super.oneFileOrNull(args);
        r = (inputName == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(inputName));
        final VCFCodec codec = new VCFCodec();
        final VCFHeader header = (VCFHeader) codec.readActualHeader(r);
        final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
        final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
        String prevContig = null;
        for (; ; ) {
            String line;
            final VariantContext ctx;
            if (r.hasNext()) {
                line = r.next();
                ctx = codec.decode(line);
            } else {
                // end of input: a null context triggers the final dump below
                line = null;
                ctx = null;
            }
            if (ctx == null || !ctx.getContig().equals(prevContig)) {
                // contig changed (or input exhausted): flush what was collected for the previous contig
                if (sorting != null) {
                    LOG.debug("Dump contig " + prevContig);
                    sorting.doneAdding();
                    CloseableIterator<GeneAndVariant> iter2 = sorting.iterator();
                    EqualRangeIterator<GeneAndVariant> eqiter = new EqualRangeIterator<>(iter2, (A, B) -> A.gene.compareTo(B.gene));
                    while (eqiter.hasNext()) {
                        final List<GeneAndVariant> variants = eqiter.next();
                        model.scan(variants.get(0).gene, variants.stream().map(L -> codec.decode(L.ctxLine)).collect(Collectors.toList()), out);
                    }
                    eqiter.close();
                    iter2.close();
                    sorting.cleanup();
                }
                sorting = null;
                if (ctx == null)
                    break;
                prevContig = ctx.getContig();
            }
            if (!ctx.isVariant())
                continue;
            if (!acceptFiltered && ctx.isFiltered())
                continue;
            if (!acceptID && ctx.hasID())
                continue;
            if (!model.accept(ctx))
                continue;
            // gene keys are prefixed with the contig name
            final Set<String> geneKeys = new HashSet<>();
            for (final AnnPredictionParser.AnnPrediction pred : annParser.getPredictions(ctx)) {
                geneKeys.addAll(pred.getGeneKeys().stream().map(S -> ctx.getContig() + "_" + S).collect(Collectors.toSet()));
            }
            for (final VepPredictionParser.VepPrediction pred : vepParser.getPredictions(ctx)) {
                geneKeys.addAll(pred.getGeneKeys().stream().map(S -> ctx.getContig() + "_" + S).collect(Collectors.toSet()));
            }
            if (sorting == null) {
                // order by gene, then by raw VCF line
                sorting = SortingCollection.newInstance(GeneAndVariant.class, new GeneAndVariantCodec(), (A, B) -> {
                    int i = A.gene.compareTo(B.gene);
                    if (i != 0)
                        return i;
                    return A.ctxLine.compareTo(B.ctxLine);
                }, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
                sorting.setDestructiveIteration(true);
            }
            for (final String gk : geneKeys) {
                final GeneAndVariant gav = new GeneAndVariant();
                gav.gene = gk;
                gav.ctxLine = line;
                sorting.add(gav);
            }
        }
        out.flush();
        out.close();
        out = null;
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        // non-null only when an exception interrupted the processing of a contig
        if (sorting != null) {
            sorting.cleanup();
        }
        CloserUtil.close(r);
        CloserUtil.close(out);
    }
}
Also used : AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) Genotype(htsjdk.variant.variantcontext.Genotype) DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) LineIterator(htsjdk.tribble.readers.LineIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) ParametersDelegate(com.beust.jcommander.ParametersDelegate) HashSet(java.util.HashSet) ContigPosRef(com.github.lindenb.jvarkit.util.vcf.ContigPosRef) DataOutputStream(java.io.DataOutputStream) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VCFCodec(htsjdk.variant.vcf.VCFCodec) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SortingCollection(htsjdk.samtools.util.SortingCollection) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) VariantContext(htsjdk.variant.variantcontext.VariantContext) VariantContext(htsjdk.variant.variantcontext.VariantContext) LineIterator(htsjdk.tribble.readers.LineIterator) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) 
AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) HashSet(java.util.HashSet) VCFCodec(htsjdk.variant.vcf.VCFCodec) IOException(java.io.IOException) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 8 with VepPredictionParser

use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser in project jvarkit by lindenb.

the class VcfToRdf method scanVCF.

/**
 * Reads a VCF and emits RDF statements for its content through {@code emit(...)}.
 * <p>
 * One {@code vcf:Variant} resource is emitted per variant. Depending on the
 * {@code printAlleles}, {@code printFilters} and {@code printGenotypes} flags,
 * alternate alleles, filters and one {@code vcf:Genotype} resource per sample
 * are emitted as well. Scanning stops early when {@code this.w.checkError()}
 * reports an error.
 *
 * @param filein the VCF file to read, or {@code null} to read from stdin
 * @throws IOException on failure; an {@code IOException} is propagated as is,
 *         any other exception is wrapped into an {@code IOException}
 */
private void scanVCF(final File filein) throws IOException {
    VcfIterator in = null;
    URI source = null;
    try {
        if (filein != null)
            source = filein.toURI();
        in = (filein == null ? VCFUtils.createVcfIteratorStdin() : VCFUtils.createVcfIteratorFromFile(filein));
        final VCFHeader header = in.getHeader();
        final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
        writeHeader(header, source);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        while (in.hasNext()) {
            if (this.w.checkError()) {
                LOG.warn("I/O interruption");
                break;
            }
            final VariantContext ctx = progress.watch(in.next());
            /* Variant */
            final URI variant = URI.create("urn:variant/" + ctx.getContig() + ":" + ctx.getStart() + ":" + ctx.getReference().getBaseString());
            emit(variant, "rdf:type", "vcf:Variant", "vcf:chrom", URI.create("urn:chrom/" + ctx.getContig()), "vcf:position", ctx.getStart(), "vcf:ref", ctx.getReference().getBaseString(), "vcf:id", (ctx.hasID() ? ctx.getID() : null), "vcf:qual", (ctx.hasLog10PError() ? ctx.getPhredScaledQual() : null));
            if (this.printAlleles) {
                for (final Allele alt : ctx.getAlternateAlleles()) {
                    emit(variant, "vcf:alt", alt.getBaseString());
                }
            }
            if (this.printFilters) {
                for (final String f : ctx.getFilters()) {
                    emit(variant, "vcf:filter", URI.create("urn:filter/" + f));
                }
            }
            if (this.printVep) {
                for (final VepPrediction prediction : vepPredictionParser.getPredictions(ctx)) {
                    /*
                     * Intentionally empty: the export of VEP predictions is disabled.
                     * The disabled code emitted one "vep:Prediction" resource per
                     * prediction (variant, allele, sequence-ontology terms, Ensembl
                     * transcript) and linked the transcript to its Ensembl gene
                     * ("uniprot:transcribedFrom") and protein ("uniprot:translatedTo").
                     */
                }
            }
            if (this.printGenotypes) {
                for (final String sample : ctx.getSampleNames()) {
                    final Genotype g = ctx.getGenotype(sample);
                    // flat list of alternating predicate / object values passed to emit(...)
                    final List<Object> L = new ArrayList<>();
                    L.add("vcf:sample");
                    L.add(URI.create("urn:sample/" + sample));
                    L.add("vcf:variant");
                    L.add(variant);
                    L.add("rdf:type");
                    L.add("vcf:Genotype");
                    if (g.hasDP()) {
                        L.add("vcf:dp");
                        L.add(g.getDP());
                    }
                    if (g.hasGQ()) {
                        L.add("vcf:gq");
                        L.add(g.getGQ());
                    }
                    if (g.isCalled()) {
                        // add a more specific rdf:type according to the zygosity
                        if (g.isHet()) {
                            if (g.isHetNonRef()) {
                                L.add("rdf:type");
                                L.add("vcf:HetNonRefGenotype");
                            } else {
                                L.add("rdf:type");
                                L.add("vcf:HetGenotype");
                            }
                        } else if (g.isHom()) {
                            if (g.isHomRef()) {
                                L.add("rdf:type");
                                L.add("vcf:HomRefGenotype");
                            } else {
                                L.add("rdf:type");
                                L.add("vcf:HomVarGenotype");
                            }
                        }
                        for (final Allele a : g.getAlleles()) {
                            L.add("vcf:allele");
                            L.add(a.getBaseString());
                        }
                    }
                    emit(URI.create("urn:gt/" + ctx.getContig() + ":" + ctx.getStart() + ":" + ctx.getReference().getBaseString() + ":" + sample), L.toArray());
                }
            }
        }
        in.close();
        in = null;
        progress.finish();
    } catch (final IOException e) {
        // already the declared type: do not bury it inside another IOException
        throw e;
    } catch (final Exception e) {
        throw new IOException(e);
    } finally {
        CloserUtil.close(in);
    }
}
Also used : VepPrediction(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) IOException(java.io.IOException) URI(java.net.URI) IOException(java.io.IOException) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Allele(htsjdk.variant.variantcontext.Allele) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VCFHeader(htsjdk.variant.vcf.VCFHeader) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 9 with VepPredictionParser

use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser in project jvarkit by lindenb.

the class VcfGeneOntology method filterVcfIterator.

/**
 * Annotates each variant with the GO terms of the genes it hits and writes
 * the result to {@code outputFile}.
 * <p>
 * Gene symbols are collected from the SnpEff and VEP predictions, restricted
 * to the keys of {@code name2go}, and stored under the INFO tag {@code TAG} as
 * {@code symbol|acn1&acn2...}. Variants without any GO term are written
 * unchanged unless {@code removeIfNoGo} is set. When {@code goTermToFilter} is
 * defined, variants failing the (optionally inverted) descendant test are
 * either flagged with {@code filterName} or, if no filter name was given,
 * dropped.
 *
 * @param in the VCF iterator to read from
 * @throws IOException propagated from the underlying I/O calls
 */
private void filterVcfIterator(final VcfIterator in) throws IOException {
    VariantContextWriter writer = null;
    try {
        final VCFHeader inputHeader = in.getHeader();
        final VCFHeader outputHeader = new VCFHeader(inputHeader);
        final String toolName = getClass().getSimpleName();
        outputHeader.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "GO terms from GO " + GO + " and GOA=" + GOA));
        outputHeader.addMetaDataLine(new VCFHeaderLine(toolName + "CmdLine", String.valueOf(getProgramCommandLine())));
        outputHeader.addMetaDataLine(new VCFHeaderLine(toolName + "Version", String.valueOf(getVersion())));
        outputHeader.addMetaDataLine(new VCFHeaderLine(toolName + "HtsJdkVersion", HtsjdkVersion.getVersion()));
        outputHeader.addMetaDataLine(new VCFHeaderLine(toolName + "HtsJdkHome", HtsjdkVersion.getHome()));
        if (filterName != null) {
            final String relation = inverse_filter ? " not descendant of " : "";
            outputHeader.addMetaDataLine(new VCFFilterHeaderLine(filterName, "Flag  GO terms " + relation + " the provided GO terms"));
        }
        writer = super.openVariantContextWriter(outputFile);
        writer.writeHeader(outputHeader);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(inputHeader.getSequenceDictionary());
        final SnpEffPredictionParser snpEffParser = new SnpEffPredictionParserFactory().header(inputHeader).get();
        final VepPredictionParser vepParser = new VepPredictionParserFactory().header(inputHeader).get();
        // stop as soon as stdout reports an error
        while (in.hasNext() && !System.out.checkError()) {
            final VariantContext ctx = progress.watch(in.next());
            // gene symbols seen for this variant
            final Set<String> geneSymbols = new HashSet<String>();
            // SnpEff: gene name only
            for (final SnpEffPrediction pred : snpEffParser.getPredictions(ctx)) {
                final String name = pred.getGeneName();
                if (name != null && !name.isEmpty()) {
                    geneSymbols.add(name);
                }
            }
            // VEP: gene name, gene and HGNC
            for (final VepPrediction pred : vepParser.getPredictions(ctx)) {
                final String[] candidates = { pred.getGeneName(), pred.getGene(), pred.getHGNC() };
                for (final String name : candidates) {
                    if (name != null && !name.isEmpty()) {
                        geneSymbols.add(name);
                    }
                }
            }
            // only keep the symbols known in name2go
            geneSymbols.retainAll(this.name2go.keySet());
            boolean matchesUserTerm = false;
            // one "symbol|acn&acn..." string per symbol
            final List<String> annotations = new ArrayList<String>();
            for (final String symbol : geneSymbols) {
                final Set<GoTree.Term> goTerms = this.name2go.get(symbol);
                if (goTerms == null || goTerms.isEmpty()) {
                    continue;
                }
                final StringBuilder sb = new StringBuilder(symbol);
                sb.append("|");
                String separator = "";
                for (final GoTree.Term goTerm : goTerms) {
                    // look for a user term having this term as descendant, until one is found
                    if (!matchesUserTerm && this.goTermToFilter != null) {
                        for (final GoTree.Term userTerm : this.goTermToFilter) {
                            if (userTerm.hasDescendant(goTerm.getAcn())) {
                                matchesUserTerm = true;
                                break;
                            }
                        }
                    }
                    sb.append(separator);
                    sb.append(goTerm.getAcn());
                    separator = "&";
                }
                annotations.add(sb.toString());
            }
            // no GO term at all for this variant
            if (annotations.isEmpty()) {
                if (!removeIfNoGo) {
                    writer.add(ctx);
                }
                continue;
            }
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            // fails when the match status equals the inversion flag
            final boolean failsGoFilter = this.goTermToFilter != null && this.inverse_filter == matchesUserTerm;
            if (failsGoFilter) {
                if (this.filterName == null) {
                    // no soft filter requested: discard the variant
                    continue;
                }
                // keep the variant but flag it
                final Set<String> filters = new HashSet<String>(ctx.getFilters());
                filters.add(this.filterName);
                vcb.filters(filters);
            }
            vcb.attribute(this.TAG, annotations);
            writer.add(vcb.make());
        }
        progress.finish();
        writer.close();
        writer = null;
    } finally {
        CloserUtil.close(writer);
    }
}
Also used : VepPrediction(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) SnpEffPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SnpEffPrediction(com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser.SnpEffPrediction) GoTree(com.github.lindenb.jvarkit.util.go.GoTree) SnpEffPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParserFactory) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 10 with VepPredictionParser

use of com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser in project jvarkit by lindenb.

the class VcfBurdenGoEnrichment method doWork.

@Override
public int doWork(final List<String> args) {
    if (StringUtil.isBlank(this.readingGo.goUri)) {
        LOG.error("Undefined GOs uri.");
        return -1;
    }
    if (this.geneFile == null || !this.geneFile.exists()) {
        LOG.error("Undefined gene file option.");
        return -1;
    }
    try {
        final GoTree gotree = this.readingGo.createParser().setIgnoreDbXRef(true).parse(this.readingGo.goUri);
        List<GoTree.Term> terms = new ArrayList<>(gotree.getTerms());
        final Map<GoTree.Term, Node> term2node = new HashMap<>();
        // build the node TREE
        while (!terms.isEmpty()) {
            int i = 0;
            while (i < terms.size()) {
                final GoTree.Term t = terms.get(i);
                if (!t.hasRelations()) {
                    term2node.put(t, new Node(t));
                    terms.remove(i);
                } else if (t.getRelations().stream().allMatch(L -> term2node.containsKey(L.getTo()))) {
                    final Node n = new Node(t);
                    n.parents.addAll(t.getRelations().stream().map(L -> term2node.get(L.getTo())).collect(Collectors.toSet()));
                    term2node.put(t, n);
                    terms.remove(i);
                } else {
                    i++;
                }
            }
        }
        terms = null;
        final Set<String> unknownAcn = new HashSet<>();
        final Map<String, Set<Node>> gene2node = new HashMap<>();
        final BufferedReader r = IOUtils.openFileForBufferedReading(this.geneFile);
        String line;
        while ((line = r.readLine()) != null) {
            if (line.isEmpty() || line.startsWith("#"))
                continue;
            final int t = line.indexOf('\t');
            if (t == -1) {
                r.close();
                LOG.error("tab missing in " + line + " of " + this.geneFile);
                return -1;
            }
            final String gene = line.substring(0, t).trim();
            if (StringUtil.isBlank(gene)) {
                r.close();
                LOG.error("Emtpy gene in " + line);
                return -1;
            }
            // using getTermByName because found sysnonym in GOA
            final String termAcn = line.substring(t + 1).trim();
            if (unknownAcn.contains(termAcn))
                continue;
            final GoTree.Term term = gotree.getTermByName(termAcn);
            if (term == null && !unknownAcn.contains(termAcn)) {
                unknownAcn.add(termAcn);
                LOG.warning("Don't know this GO term in " + line + " of " + this.geneFile + ". Could be obsolete, synonym, go specific division. Skipping.");
                continue;
            }
            final Node node = term2node.get(term);
            if (node == null) {
                r.close();
                LOG.error("Don't know this node in " + line + " of " + this.geneFile);
                return -1;
            }
            Set<Node> nodes = gene2node.get(gene);
            if (nodes == null) {
                nodes = new HashSet<>();
                gene2node.put(gene, nodes);
            }
            node.numGenes++;
            nodes.add(node);
        }
        ;
        // clean up
        unknownAcn.clear();
        r.close();
        final VcfIterator iter = openVcfIterator(oneFileOrNull(args));
        final VCFHeader header = iter.getHeader();
        final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
        final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
        final Set<Pedigree.Person> persons;
        if (this.pedFile != null) {
            final Pedigree pedigree = Pedigree.newParser().parse(this.pedFile);
            persons = new Pedigree.CaseControlExtractor().extract(header, pedigree);
        } else {
            persons = new Pedigree.CaseControlExtractor().extract(header);
        }
        final Set<Pedigree.Person> affected = persons.stream().filter(P -> P.isAffected()).collect(Collectors.toSet());
        final Set<Pedigree.Person> unaffected = persons.stream().filter(P -> P.isUnaffected()).collect(Collectors.toSet());
        if (affected.isEmpty()) {
            LOG.error("No Affected individual");
            return -1;
        }
        if (unaffected.isEmpty()) {
            LOG.error("No unaffected individual");
            return -1;
        }
        final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
        final Predicate<Genotype> isWildGenotype = G -> {
            if (G == null)
                return false;
            return G.isHomRef();
        };
        final Predicate<Genotype> isAltGenotype = G -> {
            if (G == null)
                return false;
            return G.isCalled() && !G.isHomRef();
        };
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
        while (iter.hasNext()) {
            final VariantContext ctx = progress.watch(iter.next());
            if (!this.variantFilter.test(ctx))
                continue;
            final Set<String> genes = new HashSet<>();
            for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
                for (final String col : lookColumns) {
                    final String token = pred.getByCol(col);
                    if (!StringUtil.isBlank(token)) {
                        genes.add(token);
                    }
                }
            }
            for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
                final String token = pred.getGeneName();
                if (!StringUtil.isBlank(token)) {
                    genes.add(token);
                }
            }
            if (genes.isEmpty())
                continue;
            final Set<Node> nodes = genes.stream().filter(G -> gene2node.containsKey(G)).flatMap(G -> gene2node.get(G).stream()).collect(Collectors.toSet());
            if (nodes.isEmpty())
                continue;
            final long unaffected_alt = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
            final long affected_alt = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isAltGenotype).count();
            /* no informative */
            if (unaffected_alt + affected_alt == 0L) {
                continue;
            }
            final long affected_ref = affected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
            final long unaffected_ref = unaffected.stream().map(P -> ctx.getGenotype(P.getId())).filter(G -> this.genotypeFilter.test(ctx, G)).filter(isWildGenotype).count();
            nodes.stream().forEach(N -> N.resetVisitedFlag());
            nodes.stream().forEach(N -> N.visit(unaffected_ref, unaffected_alt, affected_ref, affected_alt));
        }
        iter.close();
        progress.finish();
        LOG.info("Calculating Fisher and dumping.. please wait");
        final PrintWriter pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        pw.println("#go_term\tfisher\tname\tgo_term_depth\tcount_genes_in_this_node" + "\tunaffected_ref_gt" + "\tunaffected_alt_gt" + "\taffected_ref_gt" + "\taffected_alt_gt");
        term2node.values().stream().filter(N -> this.show_never_seeen_term || N.sum() > 0L).sorted((n1, n2) -> Double.compare(n1.fisher(), n2.fisher())).forEach(N -> {
            pw.print(N.goTerm.getAcn());
            pw.print('\t');
            pw.print(N.fisher());
            pw.print("\t");
            pw.print(N.goTerm.getName().replaceAll("[ \',\\-]+", "_"));
            pw.print("\t");
            pw.print(N.goTerm.getMinDepth());
            pw.print('\t');
            pw.print(N.numGenes);
            pw.print('\t');
            pw.print(N.unaffected_ref);
            pw.print('\t');
            pw.print(N.unaffected_alt);
            pw.print('\t');
            pw.print(N.affected_ref);
            pw.print('\t');
            pw.print(N.affected_alt);
            pw.println();
        });
        pw.flush();
        pw.close();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) Arrays(java.util.Arrays) JexlVariantPredicate(com.github.lindenb.jvarkit.util.vcf.JexlVariantPredicate) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) HashMap(java.util.HashMap) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BiPredicate(java.util.function.BiPredicate) StringUtil(htsjdk.samtools.util.StringUtil) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory) PrintWriter(java.io.PrintWriter) JexlGenotypePredicate(com.github.lindenb.jvarkit.util.vcf.JexlGenotypePredicate) GoTree(com.github.lindenb.jvarkit.util.go.GoTree) Predicate(java.util.function.Predicate) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) VariantContext(htsjdk.variant.variantcontext.VariantContext) BufferedReader(java.io.BufferedReader) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) 
VariantContext(htsjdk.variant.variantcontext.VariantContext) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) PrintWriter(java.io.PrintWriter) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) GoTree(com.github.lindenb.jvarkit.util.go.GoTree) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) BufferedReader(java.io.BufferedReader) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Aggregations

VepPredictionParser (com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) 10, VepPredictionParserFactory (com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory) 10, VariantContext (htsjdk.variant.variantcontext.VariantContext) 10, VCFHeader (htsjdk.variant.vcf.VCFHeader) 9, ArrayList (java.util.ArrayList) 8, SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) 7, IOException (java.io.IOException) 6, HashSet (java.util.HashSet) 6, List (java.util.List) 6, VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator) 5, Genotype (htsjdk.variant.variantcontext.Genotype) 5, File (java.io.File) 5, Set (java.util.Set) 5, Program (com.github.lindenb.jvarkit.util.jcommander.Program) 4, Logger (com.github.lindenb.jvarkit.util.log.Logger) 4, AnnPredictionParser (com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) 4, AnnPredictionParserFactory (com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) 4, VepPrediction (com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction) 4, VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter) 4, Collectors (java.util.stream.Collectors) 4