Search in sources :

Example 11 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project gatk by broadinstitute.

the class LiftOverVcf method doWork.

@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsReadable(CHAIN);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(REJECT);
    ////////////////////////////////////////////////////////////////////////
    // Setup the inputs
    ////////////////////////////////////////////////////////////////////////
    final LiftOver liftOver = new LiftOver(CHAIN);
    final VCFFileReader in = new VCFFileReader(INPUT, false);
    logger.info("Loading up the target reference genome.");
    final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final Map<String, byte[]> refSeqs = new HashMap<>();
    for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
        refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
    }
    CloserUtil.close(walker);
    ////////////////////////////////////////////////////////////////////////
    // Setup the outputs
    ////////////////////////////////////////////////////////////////////////
    final VCFHeader inHeader = in.getFileHeader();
    final VCFHeader outHeader = new VCFHeader(inHeader);
    outHeader.setSequenceDictionary(walker.getSequenceDictionary());
    final VariantContextWriter out = new VariantContextWriterBuilder().setOption(Options.INDEX_ON_THE_FLY).setOutputFile(OUTPUT).setReferenceDictionary(walker.getSequenceDictionary()).build();
    out.writeHeader(outHeader);
    final VariantContextWriter rejects = new VariantContextWriterBuilder().setOutputFile(REJECT).unsetOption(Options.INDEX_ON_THE_FLY).build();
    final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
    for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
    rejects.writeHeader(rejectHeader);
    ////////////////////////////////////////////////////////////////////////
    // Read the input VCF, lift the records over and write to the sorting
    // collection.
    ////////////////////////////////////////////////////////////////////////
    long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
    logger.info("Lifting variants over and sorting.");
    final SortingCollection<VariantContext> sorter = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(outHeader), outHeader.getVCFRecordComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
    ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");
    for (final VariantContext ctx : in) {
        ++total;
        final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
        final Interval target = liftOver.liftOver(source, 1.0);
        if (target == null) {
            rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
            failedLiftover++;
        } else {
            // Fix the alleles if we went from positive to negative strand
            final List<Allele> alleles = new ArrayList<>();
            for (final Allele oldAllele : ctx.getAlleles()) {
                if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
                    alleles.add(oldAllele);
                } else {
                    alleles.add(Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()));
                }
            }
            // Build the new variant context
            final VariantContextBuilder builder = new VariantContextBuilder(ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);
            builder.id(ctx.getID());
            builder.attributes(ctx.getAttributes());
            builder.genotypes(ctx.getGenotypes());
            builder.filters(ctx.getFilters());
            builder.log10PError(ctx.getLog10PError());
            // Check that the reference allele still agrees with the reference sequence
            boolean mismatchesReference = false;
            for (final Allele allele : builder.getAlleles()) {
                if (allele.isReference()) {
                    final byte[] ref = refSeqs.get(target.getContig());
                    final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length());
                    if (!refString.equalsIgnoreCase(allele.getBaseString())) {
                        mismatchesReference = true;
                    }
                    break;
                }
            }
            if (mismatchesReference) {
                rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
                failedAlleleCheck++;
            } else {
                sorter.add(builder.make());
            }
        }
        progress.record(ctx.getContig(), ctx.getStart());
    }
    final NumberFormat pfmt = new DecimalFormat("0.0000%");
    final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
    logger.info("Processed ", total, " variants.");
    logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
    logger.info(Long.toString(failedAlleleCheck), " variants lifted over but had mismatching reference alleles after lift over.");
    logger.info(pct, " of variants were not successfully lifted over and written to the output.");
    rejects.close();
    in.close();
    ////////////////////////////////////////////////////////////////////////
    // Write the sorted outputs to the final output file
    ////////////////////////////////////////////////////////////////////////
    sorter.doneAdding();
    progress = new ProgressLogger(logger, 1000000, "written");
    logger.info("Writing out sorted records to final VCF.");
    for (final VariantContext ctx : sorter) {
        out.add(ctx);
        progress.record(ctx.getContig(), ctx.getStart());
    }
    out.close();
    sorter.cleanup();
    return null;
}
Also used : LiftOver(htsjdk.samtools.liftover.LiftOver) HashMap(java.util.HashMap) DecimalFormat(java.text.DecimalFormat) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) VCFHeader(htsjdk.variant.vcf.VCFHeader) Allele(htsjdk.variant.variantcontext.Allele) VariantContextWriterBuilder(htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) NumberFormat(java.text.NumberFormat)

Example 12 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project jvarkit by lindenb.

the class VariantsInWindow method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter writer) {
    final VCFHeader header = new VCFHeader(in.getHeader());
    if (header.getInfoHeaderLine(this.winName) != null) {
        LOG.error("VCF header already contains the INFO header ID=" + this.winName);
    }
    header.addMetaDataLine(new VCFInfoHeaderLine(this.winName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Window : start|end|number-of-matching-variants|number-of-non-matching-variants"));
    if (this.filterName != null && this.treshold > 0) {
        if (header.getInfoHeaderLine(this.filterName) != null) {
            LOG.error("VCF header already contains the FORMAT header ID=" + this.filterName);
        }
        header.addMetaDataLine(new VCFFilterHeaderLine(this.filterName, "Filter defined in " + getProgramName()));
    }
    writer.writeHeader(header);
    while (in.hasNext()) {
        final VariantContext ctx = in.next();
        mapVariant(writer, ctx);
    }
    flushVariants(writer, null);
    return 0;
}
Also used : VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine)

Example 13 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project jvarkit by lindenb.

the class NoZeroVariationVCF method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    final VCFHeader header = in.getHeader();
    if (in.hasNext()) {
        LOG.info("found a variant in VCF.");
        VCFUtils.copyHeaderAndVariantsTo(in, out);
    } else {
        LOG.info("no a variant in VCF. Creating a fake Variant");
        header.addMetaDataLine(new VCFFilterHeaderLine("FAKESNP", "Fake SNP created because vcf input was empty."));
        VCFFormatHeaderLine gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_KEY);
        if (gtHeaderLine == null) {
            LOG.info("Adding GT to header");
            header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        }
        gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY);
        if (gtHeaderLine == null) {
            LOG.info("Adding GQ to header");
            header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality"));
        }
        out.writeHeader(header);
        SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        // choose random chrom, best is 'random' , but not 1...23,X,Y, etc...
        String chrom = dict.getSequence(0).getSequenceName();
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            String ssn = ssr.getSequenceName();
            if (ssn.contains("_")) {
                chrom = ssn;
                break;
            }
        }
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            String ssn = ssr.getSequenceName();
            if (ssn.toLowerCase().contains("random")) {
                chrom = ssn;
                break;
            }
            if (ssn.toLowerCase().contains("gl")) {
                chrom = ssn;
                break;
            }
        }
        GenomicSequence gseq = new GenomicSequence(this.indexedFastaSequenceFile, chrom);
        char ref = 'N';
        char alt = 'N';
        int POS = 0;
        for (POS = 0; POS < gseq.length(); ++POS) {
            ref = Character.toUpperCase(gseq.charAt(POS));
            if (ref == 'N')
                continue;
            switch(ref) {
                case 'A':
                    alt = 'T';
                    break;
                case 'T':
                    alt = 'G';
                    break;
                case 'G':
                    alt = 'C';
                    break;
                case 'C':
                    alt = 'A';
                    break;
                default:
                    break;
            }
            if (alt == 'N')
                continue;
            break;
        }
        if (alt == 'N')
            throw new RuntimeException("found only N");
        VariantContextBuilder vcb = new VariantContextBuilder();
        Allele a1 = Allele.create((byte) ref, true);
        Allele a2 = Allele.create((byte) alt, false);
        List<Allele> la1a2 = new ArrayList<Allele>(2);
        List<Genotype> genotypes = new ArrayList<Genotype>(header.getSampleNamesInOrder().size());
        la1a2.add(a1);
        la1a2.add(a2);
        vcb.chr(gseq.getChrom());
        vcb.start(POS + 1);
        vcb.stop(POS + 1);
        vcb.filter("FAKESNP");
        vcb.alleles(la1a2);
        vcb.log10PError(-0.1);
        for (String sample : header.getSampleNamesInOrder()) {
            final GenotypeBuilder gb = new GenotypeBuilder(sample);
            if (genotypes.isEmpty()) {
                gb.DP(1);
                gb.GQ(1);
                gb.alleles(la1a2);
                gb.noAD();
                gb.noPL();
                genotypes.add(gb.make());
            } else {
                genotypes.add(GenotypeBuilder.createMissing(sample, 2));
            }
        }
        vcb.genotypes(genotypes);
        vcb.noID();
        out.add(vcb.make());
    }
    return 0;
}
Also used : GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ArrayList(java.util.ArrayList) Genotype(htsjdk.variant.variantcontext.Genotype) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine)

Example 14 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project jvarkit by lindenb.

the class FixVCF method doWork.

private int doWork(String filenameIn, InputStream vcfStream, VariantContextWriter w) throws IOException {
    final AbstractVCFCodec vcfCodec = VCFUtils.createDefaultVCFCodec();
    LineIterator r = new LineIteratorImpl(new SynchronousLineReader(vcfStream));
    final VCFHeader header = (VCFHeader) vcfCodec.readActualHeader(r);
    // samples names have been changed by picard api and reordered !!!
    // re-create the original order
    List<String> sampleNamesInSameOrder = new ArrayList<String>(header.getSampleNamesInOrder().size());
    for (int col = 0; col < header.getSampleNamesInOrder().size(); ++col) {
        for (String sample : header.getSampleNameToOffset().keySet()) {
            if (header.getSampleNameToOffset().get(sample) == col) {
                sampleNamesInSameOrder.add(sample);
                break;
            }
        }
    }
    if (sampleNamesInSameOrder.size() != header.getSampleNamesInOrder().size()) {
        throw new IllegalStateException();
    }
    VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), sampleNamesInSameOrder);
    File tmp = IOUtil.newTempFile("tmp", ".vcf.gz", new File[] { tmpDir });
    tmp.deleteOnExit();
    PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(tmp)));
    while (r.hasNext()) {
        String line = r.next();
        pw.println(line);
        VariantContext ctx = null;
        try {
            ctx = vcfCodec.decode(line);
        } catch (Exception err) {
            pw.close();
            LOG.error(line);
            LOG.error(err);
            return -1;
        }
        for (String f : ctx.getFilters()) {
            if (h2.getFilterHeaderLine(f) != null)
                continue;
            // if(f.equals(VCFConstants.PASSES_FILTERS_v4)) continue; hum...
            if (f.isEmpty() || f.equals(VCFConstants.UNFILTERED))
                continue;
            LOG.info("Fixing missing Filter:" + f);
            h2.addMetaDataLine(new VCFFilterHeaderLine(f));
        }
        for (String tag : ctx.getAttributes().keySet()) {
            if (h2.getInfoHeaderLine(tag) != null)
                continue;
            LOG.info("Fixing missing INFO:" + tag);
            h2.addMetaDataLine(new VCFInfoHeaderLine(tag, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "undefined. Saved by " + getClass()));
        }
    }
    pw.flush();
    pw.close();
    pw = null;
    LOG.info("re-reading VCF frm tmpFile:" + tmp);
    h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName(), "Saved VCF FILTER AND INFO from " + filenameIn));
    // save header in memory
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    VariantContextWriter w2 = VCFUtils.createVariantContextWriterToOutputStream(baos);
    w2.writeHeader(h2);
    w2.close();
    baos.close();
    // reopen tmp file
    @SuppressWarnings("resource") VcfIterator in = new VcfIteratorImpl(new SequenceInputStream(new ByteArrayInputStream(baos.toByteArray()), new GZIPInputStream(new FileInputStream(tmp))));
    w.writeHeader(h2);
    while (in.hasNext()) {
        w.add(in.next());
    }
    in.close();
    tmp.delete();
    return 0;
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) LineIterator(htsjdk.tribble.readers.LineIterator) GZIPInputStream(java.util.zip.GZIPInputStream) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) GZIPOutputStream(java.util.zip.GZIPOutputStream) SynchronousLineReader(htsjdk.tribble.readers.SynchronousLineReader) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) LineIteratorImpl(htsjdk.tribble.readers.LineIteratorImpl) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) VcfIteratorImpl(com.github.lindenb.jvarkit.util.vcf.VcfIteratorImpl) ByteArrayOutputStream(java.io.ByteArrayOutputStream) AbstractVCFCodec(htsjdk.variant.vcf.AbstractVCFCodec) IOException(java.io.IOException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) FileInputStream(java.io.FileInputStream) SequenceInputStream(java.io.SequenceInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) FileOutputStream(java.io.FileOutputStream) File(java.io.File)

Example 15 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project jvarkit by lindenb.

the class VcfToRdf method writeHeader.

private void writeHeader(final VCFHeader header, final URI source) {
    if (source != null) {
        emit(source, "rdf:type", "vcf:File");
    }
    final SAMSequenceDictionary dict = header.getSequenceDictionary();
    if (dict != null) {
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            emit(URI.create("urn:chrom/" + ssr.getSequenceName()), "rdf:type", "vcf:Chromosome", "dc:title", ssr.getSequenceName(), "vcf:length", ssr.getSequenceLength(), "vcf:sequenceIndex", ssr.getSequenceIndex());
        }
    }
    for (final VCFFilterHeaderLine h : header.getFilterLines()) {
        emit(URI.create("urn:filter/" + h.getID()), "rdf:type", "vcf:Filter", "dc:title", h.getID(), "dc:description", h.getValue());
    }
    final Pedigree pedigree = Pedigree.newParser().parse(header);
    for (final Pedigree.Person pe : pedigree.getPersons()) {
        final URI sample = URI.create("urn:sample/" + pe.getId());
        emit(sample, "rdf:type", "foaf:Person", "foaf:name", pe.getId(), "foaf:family", pe.getFamily().getId());
        if (pe.isMale())
            emit(sample, "idt:gender", "male");
        if (pe.isFemale())
            emit(sample, "idt:gender", "female");
        if (pe.isAffected())
            emit(sample, "idt:status", "affected");
        if (pe.isUnaffected())
            emit(sample, "idt:status", "unaffected");
    }
    // Sample
    for (final String sample : header.getSampleNamesInOrder()) {
        emit(URI.create("urn:sample/" + sample), "rdf:type", "vcf:Sample", "dc:title", sample);
    }
}
Also used : Pedigree(com.github.lindenb.jvarkit.util.Pedigree) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) URI(java.net.URI)

Aggregations

VCFFilterHeaderLine (htsjdk.variant.vcf.VCFFilterHeaderLine)25 VCFHeader (htsjdk.variant.vcf.VCFHeader)23 VariantContext (htsjdk.variant.variantcontext.VariantContext)17 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)16 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)15 ArrayList (java.util.ArrayList)15 VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine)14 VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine)13 File (java.io.File)13 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)12 HashSet (java.util.HashSet)11 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)10 Allele (htsjdk.variant.variantcontext.Allele)10 VCFFormatHeaderLine (htsjdk.variant.vcf.VCFFormatHeaderLine)9 List (java.util.List)9 Genotype (htsjdk.variant.variantcontext.Genotype)8 GenotypeBuilder (htsjdk.variant.variantcontext.GenotypeBuilder)8 Set (java.util.Set)8 Collectors (java.util.stream.Collectors)8 Parameter (com.beust.jcommander.Parameter)7