Search in sources :

Example 11 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class MantaMerger method doWork.

@Override
public int doWork(final List<String> args) {
    VariantContextWriter out = null;
    try {
        final Map<String, VcfInput> sample2inputs = new TreeMap<>();
        SAMSequenceDictionary dict = null;
        final List<String> lines;
        if (args.size() == 1 && args.get(0).endsWith(".list")) {
            lines = Files.lines(Paths.get(args.get(0))).filter(L -> !(StringUtils.isBlank(L) || L.startsWith("#"))).collect(Collectors.toList());
        } else {
            lines = args;
        }
        for (final String line : lines) {
            final String[] tokens = CharSplitter.TAB.split(line);
            final VcfInput vcfInput = new VcfInput();
            vcfInput.vcfPath = Paths.get(tokens[0]);
            IOUtil.assertFileIsReadable(vcfInput.vcfPath);
            final SAMSequenceDictionary dict1 = SequenceDictionaryUtils.extractRequired(vcfInput.vcfPath);
            if (dict == null) {
                dict = dict1;
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict1)) {
                throw new JvarkitException.DictionariesAreNotTheSame(dict1, dict);
            }
            if (tokens.length < 2 || StringUtils.isBlank(tokens[1])) {
                try (VCFReader r = VCFReaderFactory.makeDefault().open(vcfInput.vcfPath, false)) {
                    List<String> snl = r.getHeader().getSampleNamesInOrder();
                    if (snl.size() == 1) {
                        vcfInput.sample = snl.get(0);
                    } else {
                        vcfInput.sample = vcfInput.vcfPath.toString();
                    }
                }
            } else {
                vcfInput.sample = tokens[1];
            }
            if (sample2inputs.containsKey(vcfInput.sample)) {
                LOG.error("duplicate sample " + vcfInput.sample);
                return -1;
            }
            sample2inputs.put(vcfInput.sample, vcfInput);
        }
        if (sample2inputs.isEmpty()) {
            LOG.error("no input found");
            return -1;
        }
        if (!StringUtils.isBlank(this.limitContig) && dict.getSequence(this.limitContig) == null) {
            LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(this.limitContig, dict));
            return -1;
        }
        final Predicate<VariantContext> bedPredicate;
        if (this.excludeBedPath != null) {
            final BedLineCodec codec = new BedLineCodec();
            final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
            final IntervalTreeMap<Boolean> map = new IntervalTreeMap<>();
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
                br.lines().filter(L -> !BedLine.isBedHeader(L)).map(L -> codec.decode(L)).filter(L -> L != null).filter(B -> !StringUtils.isBlank(converter.apply(B.getContig()))).map(B -> new Interval(converter.apply(B.getContig()), B.getStart(), B.getEnd())).forEach(R -> map.put(R, true));
            }
            bedPredicate = V -> !map.containsOverlapping(V);
        } else {
            bedPredicate = V -> true;
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
        metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
        final VCFInfoHeaderLine infoSvLen = new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation length");
        metaData.add(infoSvLen);
        final VCFInfoHeaderLine infoNSamples = new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples");
        metaData.add(infoNSamples);
        final VCFInfoHeaderLine infoSamples = new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "amples");
        metaData.add(infoSamples);
        final VCFFilterHeaderLine filterSameNext = new VCFFilterHeaderLine("NEXT", "next variant in VCF is the same.");
        metaData.add(filterSameNext);
        final VCFFilterHeaderLine filterSamePrev = new VCFFilterHeaderLine("PREV", "next variant in VCF is the same.");
        metaData.add(filterSamePrev);
        final VCFHeader header = new VCFHeader(metaData, sample2inputs.keySet());
        header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        out = VCFUtils.createVariantContextWriterToPath(this.outputFile);
        out.writeHeader(header);
        final Decoy decoy = Decoy.getDefaultInstance();
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            if (!StringUtils.isBlank(this.limitContig)) {
                if (!ssr.getSequenceName().equals(this.limitContig))
                    continue;
            }
            LOG.info("contig " + ssr.getSequenceName());
            if (decoy.isDecoy(ssr.getSequenceName()))
                continue;
            final Map<SVKey, Set<String>> variants2samples = new HashMap<>();
            for (final VcfInput vcfinput : sample2inputs.values()) {
                // reset count for this contig
                vcfinput.contigCount = 0;
                try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
                    vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength()).stream().filter(V -> discard_bnd == false || !V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND")).filter(bedPredicate).map(V -> new VariantContextBuilder(V).unfiltered().noID().noGenotypes().rmAttribute("EVENT").rmAttribute("HOMSEQ").rmAttribute("HOMLEN").rmAttribute("SVINSSEQ").rmAttribute("SVINSLEN").rmAttribute("MATEID").rmAttribute("LEFT_SVINSSEQ").rmAttribute("RIGHT_SVINSSEQ").rmAttribute("BND_DEPTH").rmAttribute("MATE_BND_DEPTH").rmAttribute("JUNCTION_QUAL").rmAttribute("CIGAR").make()).forEach(V -> {
                        final SVKey key1 = new SVKey(V);
                        if (!svComparator.test(V, V))
                            throw new RuntimeException("compare to self failed ! " + V);
                        variants2samples.put(key1, new HashSet<>());
                        vcfinput.contigCount++;
                    });
                }
            }
            if (variants2samples.isEmpty())
                continue;
            // build an interval tree for a faster access
            final IntervalTree<SVKey> intervalTree = new IntervalTree<>();
            for (final SVKey key : variants2samples.keySet()) {
                final SimpleInterval r = new SimpleInterval(key.archetype).extend(this.svComparator.getBndDistance() + 1);
                intervalTree.put(r.getStart(), r.getEnd(), key);
                // paranoidcheck interval is ok to find current archetype
                boolean found = false;
                final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
                while (nodeIter.hasNext()) {
                    final SVKey key1 = nodeIter.next().getValue();
                    if (this.svComparator.test(key1.archetype, key.archetype)) {
                        found = true;
                        break;
                    }
                }
                if (!found) {
                    out.close();
                    throw new RuntimeException("cannot find self " + key.archetype + " in " + r);
                }
            }
            for (final VcfInput vcfinput : sample2inputs.values()) {
                try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
                    final CloseableIterator<VariantContext> iter = vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength());
                    while (iter.hasNext()) {
                        final VariantContext ctx = iter.next();
                        if (this.discard_bnd && ctx.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
                            continue;
                        if (!bedPredicate.test(ctx))
                            continue;
                        final SimpleInterval r = new SimpleInterval(ctx).extend(this.svComparator.getBndDistance() + 1);
                        final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
                        while (nodeIter.hasNext()) {
                            final SVKey key1 = nodeIter.next().getValue();
                            if (!this.svComparator.test(key1.archetype, ctx))
                                continue;
                            final Set<String> samples = variants2samples.get(key1);
                            samples.add(vcfinput.sample);
                        }
                    }
                    iter.close();
                }
            }
            final Comparator<VariantContext> sorter = new ContigDictComparator(dict).createLocatableComparator();
            final List<SVKey> orderedKeys = variants2samples.keySet().stream().filter(// no samples for this key ??!
            K -> !variants2samples.get(K).isEmpty()).sorted((A, B) -> sorter.compare(A.archetype, B.archetype)).collect(Collectors.toCollection(ArrayList::new));
            // remove duplicates
            int i = 0;
            while (i + 1 < orderedKeys.size()) {
                final SVKey key1 = orderedKeys.get(i);
                final SVKey key2 = orderedKeys.get(i + 1);
                if (svComparator.test(key1.archetype, key2.archetype) && // same samples
                variants2samples.get(key1).equals(variants2samples.get(key2))) {
                    orderedKeys.remove(i + 1);
                } else {
                    i++;
                }
            }
            for (int key_index = 0; key_index < orderedKeys.size(); key_index++) {
                final SVKey key = orderedKeys.get(key_index);
                final Set<String> samples = variants2samples.get(key);
                final Allele refAllele = key.archetype.getReference();
                final Allele altAllele = Allele.create("<SV>", false);
                final Object svType = key.archetype.getAttribute(VCFConstants.SVTYPE, ".");
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(key.archetype.getContig());
                vcb.start(key.archetype.getStart());
                vcb.stop(key.archetype.getEnd());
                vcb.log10PError(key.archetype.getLog10PError());
                vcb.alleles(Arrays.asList(refAllele, altAllele));
                vcb.attribute(VCFConstants.END_KEY, key.archetype.getEnd());
                vcb.attribute(VCFConstants.SVTYPE, svType);
                vcb.attribute(infoSvLen.getID(), (svType.equals("DEL") ? -1 : 1) * key.archetype.getLengthOnReference());
                vcb.attribute(infoNSamples.getID(), samples.size());
                vcb.attribute(infoSamples.getID(), samples.stream().sorted().collect(Collectors.toList()));
                int ac = 0;
                final List<Genotype> genotypes = new ArrayList<>(sample2inputs.size());
                for (final String sn : sample2inputs.keySet()) {
                    List<Allele> gta;
                    if (samples.contains(sn)) {
                        ac++;
                        gta = Arrays.asList(refAllele, altAllele);
                    } else {
                        gta = Arrays.asList(refAllele, refAllele);
                    }
                    genotypes.add(new GenotypeBuilder(sn, gta).make());
                }
                vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, ac);
                vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, sample2inputs.size() * 2);
                if (ac > 0) {
                    vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, ac / (float) sample2inputs.size() * 2);
                }
                if (key_index > 0 && svComparator.test(key.archetype, orderedKeys.get(key_index - 1).archetype)) {
                    vcb.filter(filterSamePrev.getID());
                }
                if (key_index + 1 < orderedKeys.size() && svComparator.test(key.archetype, orderedKeys.get(key_index + 1).archetype)) {
                    System.err.println("SAME\n" + key.archetype + "\n" + orderedKeys.get(key_index + 1).archetype);
                    vcb.filter(filterSameNext.getID());
                }
                vcb.genotypes(genotypes);
                out.add(vcb.make());
            }
        }
        out.close();
        out = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) IOUtil(htsjdk.samtools.util.IOUtil) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) Map(java.util.Map) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) HashMap(java.util.HashMap) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Decoy(com.github.lindenb.jvarkit.samtools.Decoy) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) VCFConstants(htsjdk.variant.vcf.VCFConstants) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Iterator(java.util.Iterator) Files(java.nio.file.Files) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) StructuralVariantComparator(com.github.lindenb.jvarkit.variant.sv.StructuralVariantComparator) IntervalTree(htsjdk.samtools.util.IntervalTree) TreeMap(java.util.TreeMap) Paths(java.nio.file.Paths) BufferedReader(java.io.BufferedReader) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) IntervalTree(htsjdk.samtools.util.IntervalTree) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) Decoy(com.github.lindenb.jvarkit.samtools.Decoy) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Interval(htsjdk.samtools.util.Interval) Set(java.util.Set) HashSet(java.util.HashSet) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) VCFReader(htsjdk.variant.vcf.VCFReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) TreeMap(java.util.TreeMap) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap)

Example 12 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class VcfStrechToSvg method doWork.

@Override
public int doWork(final List<String> args) {
    try {
        this.afExtractor = new AFExtractorFactory().parseFieldExtractor(this.afExtractorFactoryStr);
        final String input = super.oneAndOnlyOneFile(args);
        if (!this.bamListPath.isEmpty()) {
            LOG.info("reading bam list");
            for (Path bamPath : IOUtils.unrollPaths(this.bamListPath)) {
                try (SamReader sr = openSamReader(bamPath)) {
                    final SAMFileHeader hdr = sr.getFileHeader();
                    for (final String sn : hdr.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet())) {
                        if (this.sample2bam.containsKey(sn)) {
                            LOG.info("duplicate sample in bam " + bamPath + " and " + this.sample2bam.get(sn));
                            return -1;
                        }
                        this.sample2bam.put(sn, bamPath);
                    }
                }
            }
        }
        try (VCFReader r = VCFReaderFactory.makeDefault().open(input, true)) {
            final VCFHeader header = r.getHeader();
            this.afExtractor.validateHeader(header);
            final String searchFormat;
            switch(this.useFormat) {
                case PL:
                    searchFormat = VCFConstants.GENOTYPE_PL_KEY;
                    break;
                // through
                case AD:
                default:
                    searchFormat = VCFConstants.GENOTYPE_ALLELE_DEPTHS;
                    break;
            }
            if (header.getFormatHeaderLine(searchFormat) == null) {
                LOG.error("FORMAT/" + searchFormat + " undefined in " + input);
                return -1;
            }
            if (!header.hasGenotypingData()) {
                LOG.error("No genotype in input");
                return -1;
            }
            final SAMSequenceDictionary dict = header.getSequenceDictionary();
            if (dict != null && this.refFaidx != null) {
                SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(this.refFaidx));
            }
            if (this.gff3Path != null) {
                LOG.info("reading gtf" + this.gff3Path);
                try (GtfReader gtfReader = new GtfReader(this.gff3Path)) {
                    if (dict != null)
                        gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
                    gtfReader.getAllGenes().forEach(G -> {
                        this.all_genes.put(new Interval(G), G);
                    });
                }
            }
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.bedFile)) {
                try (PrintWriter manifest = (this.manifestPath == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openPathForPrintWriter(this.manifestPath))) {
                    try (ArchiveFactory out = ArchiveFactory.open(this.outputFile)) {
                        final BedLineCodec codec = new BedLineCodec();
                        br.lines().map(L -> codec.decode(L)).filter(B -> B != null).forEach(B -> {
                            run(out, B, header, r, manifest);
                        });
                    }
                    manifest.flush();
                }
            }
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : Path(java.nio.file.Path) Transformer(javax.xml.transform.Transformer) Text(org.w3c.dom.Text) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) StreamResult(javax.xml.transform.stream.StreamResult) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) SVG(com.github.lindenb.jvarkit.util.svg.SVG) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Hyperlink(com.github.lindenb.jvarkit.net.Hyperlink) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) CoverageFactory(com.github.lindenb.jvarkit.samtools.CoverageFactory) Document(org.w3c.dom.Document) Map(java.util.Map) Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) SAMException(htsjdk.samtools.SAMException) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) AFExtractorFactory(com.github.lindenb.jvarkit.util.vcf.AFExtractorFactory) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) ToDoubleFunction(java.util.function.ToDoubleFunction) VariantContext(htsjdk.variant.variantcontext.VariantContext) GZIPOutputStream(java.util.zip.GZIPOutputStream) DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) Genotype(htsjdk.variant.variantcontext.Genotype) DOMSource(javax.xml.transform.dom.DOMSource) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) OptionalDouble(java.util.OptionalDouble) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) HashMap(java.util.HashMap) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) Interval(htsjdk.samtools.util.Interval) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) Node(org.w3c.dom.Node) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) VCFConstants(htsjdk.variant.vcf.VCFConstants) OutputStream(java.io.OutputStream) Locatable(htsjdk.samtools.util.Locatable) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) DecimalFormat(java.text.DecimalFormat) VCFReader(htsjdk.variant.vcf.VCFReader) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) Element(org.w3c.dom.Element) FractionConverter(com.github.lindenb.jvarkit.jcommander.converter.FractionConverter) DocumentBuilder(javax.xml.parsers.DocumentBuilder) DynamicParameter(com.beust.jcommander.DynamicParameter) BufferedReader(java.io.BufferedReader) TransformerFactory(javax.xml.transform.TransformerFactory) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SamReader(htsjdk.samtools.SamReader) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) AFExtractorFactory(com.github.lindenb.jvarkit.util.vcf.AFExtractorFactory) VCFReader(htsjdk.variant.vcf.VCFReader) BufferedReader(java.io.BufferedReader) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) SAMFileHeader(htsjdk.samtools.SAMFileHeader) VCFHeader(htsjdk.variant.vcf.VCFHeader) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Interval(htsjdk.samtools.util.Interval) PrintWriter(java.io.PrintWriter)

Example 13 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class TestNg01 method getVcfHeader.

static VCFHeader getVcfHeader(final File f) {
    final VCFReader r = VCFReaderFactory.makeDefault().open(f, false);
    final VCFHeader h = r.getHeader();
    CloserUtil.close(r);
    return h;
}
Also used : VCFReader(htsjdk.variant.vcf.VCFReader) VCFHeader(htsjdk.variant.vcf.VCFHeader)

Example 14 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class BamAlleleBalance method doWork.

@Override
public int doWork(List<String> args) {
    try {
        final List<Path> bamPaths = IOUtils.unrollPaths(args);
        if (bamPaths.isEmpty()) {
            LOG.error("Bam list is empty");
            return -1;
        }
        final List<Snp> snps = new ArrayList<>(10_000);
        try (VCFReader vr = VCFReaderFactory.makeDefault().open(this.variantsPath, false)) {
            try (CloseableIterator<VariantContext> iter = vr.iterator()) {
                while (iter.hasNext()) {
                    final VariantContext ctx = iter.next();
                    if (!ctx.isBiallelic() || ctx.isFiltered() || !ctx.isSNP())
                        continue;
                    snps.add(new Snp(ctx));
                }
            }
        }
        if (snps.isEmpty()) {
            LOG.error("snp list is empty");
            return -1;
        }
        LOG.info(String.valueOf(snps.size()) + " snp(s)");
        final Map<String, Counter<RangeOfDoubles.Range>> sample2count = new HashMap<>(bamPaths.size());
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null)
            srf.referenceSequence(this.faidx);
        for (final Path bamPath : bamPaths) {
            try (SamReader samReader = srf.open(bamPath)) {
                final String defaultPartition = IOUtils.getFilenameWithoutCommonSuffixes(bamPath);
                final SAMFileHeader header = samReader.getFileHeader();
                final Set<String> samples = header.getReadGroups().stream().map(RG -> this.groupBy.apply(RG, defaultPartition)).collect(Collectors.toSet());
                samples.stream().forEach(SN -> sample2count.putIfAbsent(SN, new Counter<>()));
                final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
                final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
                for (final Snp snp : snps) {
                    final String bamContig = converter.apply(snp.contigSrc);
                    if (StringUtils.isBlank(bamContig))
                        continue;
                    final Map<String, SiteInfo> sample2siteinfo = new HashMap<>(samples.size());
                    for (final String sn : samples) {
                        sample2siteinfo.put(sn, new SiteInfo());
                    }
                    try (CloseableIterator<SAMRecord> iter = samReader.queryOverlapping(bamContig, snp.pos1, snp.pos1)) {
                        while (iter.hasNext()) {
                            final SAMRecord rec = iter.next();
                            if (!SAMRecordDefaultFilter.accept(rec, this.mapq))
                                continue;
                            final SAMReadGroupRecord rg = rec.getReadGroup();
                            if (rg == null)
                                continue;
                            String sample = this.groupBy.apply(rg, defaultPartition);
                            if (StringUtils.isBlank(sample) || !sample2siteinfo.containsKey(sample))
                                continue;
                            final Cigar cigar = rec.getCigar();
                            if (cigar == null || cigar.isEmpty())
                                continue;
                            byte[] bases = rec.getReadBases();
                            if (bases == null || bases == SAMRecord.NULL_SEQUENCE)
                                continue;
                            final int readpos1 = rec.getReadPositionAtReferencePosition(snp.pos1);
                            if (readpos1 < 1)
                                continue;
                            final int readpos0 = readpos1 - 1;
                            if (readpos0 < 0 || readpos0 >= bases.length)
                                continue;
                            final char base = (char) Character.toUpperCase(bases[readpos0]);
                            final SiteInfo si = sample2siteinfo.get(sample);
                            if (si == null)
                                continue;
                            if (base == snp.ref) {
                                si.countRef++;
                            } else if (base == snp.alt) {
                                si.countAlt++;
                            }
                        }
                        for (final String sample : sample2siteinfo.keySet()) {
                            final SiteInfo si = sample2siteinfo.get(sample);
                            final int depth = si.countAlt + si.countRef;
                            if (depth < this.min_depth)
                                continue;
                            // must be het
                            if (si.countAlt == 0 || si.countRef == 0)
                                continue;
                            final double ratio = si.countAlt / (double) depth;
                            final RangeOfDoubles.Range range = this.ratios.getRange(ratio);
                            sample2count.get(sample).incr(range);
                        }
                    }
                // end of iter
                }
            // end of loop over snps
            }
        }
        // print report
        try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
            final List<String> sortedSamples = sample2count.keySet().stream().sorted((A, B) -> Long.compare(sample2count.get(A).getTotal(), sample2count.get(B).getTotal())).collect(Collectors.toList());
            pw.print("RANGE");
            for (final String sn : sortedSamples) {
                pw.print("\t");
                pw.print(sn);
            }
            pw.println();
            for (final RangeOfDoubles.Range r : this.ratios.getRanges()) {
                pw.print(r.toString());
                for (final String sn : sortedSamples) {
                    pw.print("\t");
                    pw.print(sample2count.get(sn).count(r));
                }
                pw.println();
            }
            pw.flush();
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    }
}
Also used : Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) HashMap(java.util.HashMap) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ArrayList(java.util.ArrayList) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) SAMRecordDefaultFilter(com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) RangeOfDoubles(com.github.lindenb.jvarkit.math.RangeOfDoubles) Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) Counter(com.github.lindenb.jvarkit.util.Counter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) SamReader(htsjdk.samtools.SamReader) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContext(htsjdk.variant.variantcontext.VariantContext) SamReaderFactory(htsjdk.samtools.SamReaderFactory) HashMap(java.util.HashMap) RangeOfDoubles(com.github.lindenb.jvarkit.math.RangeOfDoubles) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) VCFReader(htsjdk.variant.vcf.VCFReader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) PrintWriter(java.io.PrintWriter) Path(java.nio.file.Path) SamReaderFactory(htsjdk.samtools.SamReaderFactory) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Example 15 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class AbstractBam2Raster method loadVCFs.

protected void loadVCFs() {
    for (final String vcfFile : IOUtils.unrollFiles(variants)) {
        try (final VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(vcfFile), true)) {
            final VCFHeader header = vcfFileReader.getHeader();
            final SAMSequenceDictionary dict = header.getSequenceDictionary();
            final ContigNameConverter converter;
            if (dict != null) {
                converter = ContigNameConverter.fromOneDictionary(dict);
            } else {
                converter = ContigNameConverter.getIdentity();
            }
            final String newCtg = converter.apply(this.interval.getContig());
            if (!StringUtil.isBlank(newCtg)) {
                final CloseableIterator<VariantContext> r = vcfFileReader.query(this.interval.getContig(), this.interval.getStart(), this.interval.getEnd());
                while (r.hasNext()) {
                    this.highlightPositions.add(r.next().getStart());
                }
                r.close();
            }
        } catch (final IOException err) {
            throw new RuntimeIOException(err);
        }
    }
}
Also used : RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) VCFReader(htsjdk.variant.vcf.VCFReader) VariantContext(htsjdk.variant.variantcontext.VariantContext) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)

Aggregations

VCFReader (htsjdk.variant.vcf.VCFReader)60 VariantContext (htsjdk.variant.variantcontext.VariantContext)45 Path (java.nio.file.Path)41 VCFHeader (htsjdk.variant.vcf.VCFHeader)37 VCFReaderFactory (com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory)34 CloseableIterator (htsjdk.samtools.util.CloseableIterator)32 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)31 Collectors (java.util.stream.Collectors)30 ArrayList (java.util.ArrayList)29 List (java.util.List)29 Logger (com.github.lindenb.jvarkit.util.log.Logger)28 Parameter (com.beust.jcommander.Parameter)27 Program (com.github.lindenb.jvarkit.util.jcommander.Program)26 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)24 Set (java.util.Set)22 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)21 IOException (java.io.IOException)21 CloserUtil (htsjdk.samtools.util.CloserUtil)20 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)20 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)19