Search in sources :

Example 66 with SimpleInterval

use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

the class VCFBed method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, final VCFIterator vcfin, final VariantContextWriter out) {
    final VCFHeader header = vcfin.getHeader();
    final VCFHeader h2 = new VCFHeader(header);
    final VCFInfoHeaderLine infoHeader = new VCFInfoHeaderLine(this.infoName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "metadata added from " + this.inputBedFile + " . Format was " + this.formatPattern.replaceAll("[\"\'\\\\]", " "));
    final VCFInfoHeaderLine infoCountBasicOverlap = new VCFInfoHeaderLine(this.infoName + "_N", 1, VCFHeaderLineType.Integer, "Number of raw overlap within distance " + this.within_distance + "bp.");
    final VCFInfoHeaderLine infoCountFinerOverlap = new VCFInfoHeaderLine(this.infoName + "_C", 1, VCFHeaderLineType.Integer, "Number of finer overlap within distance " + inputBedFile + " and overlap bed:" + this.min_overlap_bed_fraction + " and overlap vcf: " + this.min_overlap_vcf_fraction);
    if (infoCountBasicOverlap != null)
        h2.addMetaDataLine(infoCountBasicOverlap);
    if (infoCountFinerOverlap != null)
        h2.addMetaDataLine(infoCountFinerOverlap);
    h2.addMetaDataLine(infoHeader);
    JVarkitVersion.getInstance().addMetaData(this, h2);
    out.writeHeader(h2);
    while (vcfin.hasNext()) {
        final VariantContext ctx = vcfin.next();
        final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
        vcb.rmAttribute(infoCountBasicOverlap.getID());
        vcb.rmAttribute(infoCountFinerOverlap.getID());
        vcb.rmAttribute(infoHeader.getID());
        if (this.ignoreFILTERed && ctx.isFiltered()) {
            out.add(vcb.make());
            continue;
        }
        final String normalizedContig = this.contigNameConverter.apply(ctx.getContig());
        if (StringUtil.isBlank(normalizedContig)) {
            out.add(ctx);
            continue;
        }
        final Optional<? extends Locatable> bndOpt = this.disable_bnd_eval ? Optional.empty() : Breakend.parseInterval(ctx);
        final SimpleInterval theInterval;
        if (bndOpt.isPresent()) {
            theInterval = new SimpleInterval(normalizedContig, bndOpt.get().getStart(), bndOpt.get().getEnd());
        } else {
            theInterval = new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd());
        }
        final SimpleInterval extendedInterval;
        if (this.within_distance <= 0) {
            extendedInterval = theInterval;
        } else {
            extendedInterval = new SimpleInterval(theInterval.getContig(), Math.max(1, theInterval.getStart() + this.within_distance), theInterval.getEnd() + this.within_distance);
        }
        int count_basic_overlap = 0;
        int count_finer_overlap = 0;
        final Set<String> annotations = new LinkedHashSet<>();
        if (this.intervalTreeMap != null) {
            for (final Set<BedLine> bedLines : this.intervalTreeMap.getOverlapping(extendedInterval)) {
                for (final BedLine bedLine : bedLines) {
                    count_basic_overlap++;
                    if (!testFinerIntersection(theInterval, bedLine))
                        continue;
                    count_finer_overlap++;
                    final String newannot = this.bedJexlToString.apply(new BedJEXLContext(bedLine, ctx));
                    if (!StringUtil.isBlank(newannot)) {
                        annotations.add(VCFUtils.escapeInfoField(newannot));
                    }
                }
            }
        } else {
            try (CloseableIterator<BedLine> iter = this.bedReader.iterator(theInterval.getContig(), Math.max(0, theInterval.getStart() - 1), theInterval.getEnd() + 1)) {
                while (iter.hasNext()) {
                    final BedLine bedLine = iter.next();
                    if (!theInterval.contigsMatch(bedLine))
                        continue;
                    if (!extendedInterval.withinDistanceOf(bedLine, this.within_distance))
                        continue;
                    count_basic_overlap++;
                    if (!testFinerIntersection(theInterval, bedLine))
                        continue;
                    count_finer_overlap++;
                    final String newannot = this.bedJexlToString.apply(new BedJEXLContext(bedLine, ctx));
                    if (!StringUtil.isBlank(newannot))
                        annotations.add(VCFUtils.escapeInfoField(newannot));
                }
            } catch (final IOException ioe) {
                LOG.error(ioe);
                throw new RuntimeIOException(ioe);
            }
        }
        if (!annotations.isEmpty()) {
            vcb.attribute(infoHeader.getID(), annotations.toArray());
        }
        vcb.attribute(infoCountBasicOverlap.getID(), count_basic_overlap);
        vcb.attribute(infoCountFinerOverlap.getID(), count_finer_overlap);
        out.add(vcb.make());
    }
    out.close();
    return 0;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) VariantContext(htsjdk.variant.variantcontext.VariantContext) JexlToString(com.github.lindenb.jvarkit.jexl.JexlToString) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VCFHeader(htsjdk.variant.vcf.VCFHeader)

Example 67 with SimpleInterval

use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

the class VCFMerge method doWork.

@Override
public int doWork(final List<String> args) {
    final List<Path> userVcfFiles = new ArrayList<Path>();
    final Set<String> genotypeSampleNames = new TreeSet<String>();
    SAMSequenceDictionary dict = null;
    VariantContextWriter w = null;
    SortingCollection<VariantContext> array = null;
    CloseableIterator<VariantContext> iter = null;
    try {
        userVcfFiles.addAll(IOUtils.unrollPaths(args));
        final Set<String> fieldsSet = Arrays.stream(this.fieldsString.split("[,; ]")).collect(Collectors.toSet());
        final boolean with_ac = fieldsSet.contains(VCFConstants.ALLELE_COUNT_KEY);
        final boolean with_an = fieldsSet.contains(VCFConstants.ALLELE_NUMBER_KEY);
        final boolean with_af = fieldsSet.contains(VCFConstants.ALLELE_FREQUENCY_KEY);
        final boolean with_dp = fieldsSet.contains(VCFConstants.DEPTH_KEY);
        final boolean with_gq = fieldsSet.contains(VCFConstants.GENOTYPE_QUALITY_KEY);
        final boolean with_ad = fieldsSet.contains(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        final boolean with_pl = fieldsSet.contains(VCFConstants.GENOTYPE_PL_KEY);
        if (userVcfFiles.isEmpty()) {
            LOG.error("No input");
            return -1;
        }
        final boolean requireIndex = !StringUtils.isBlank(this.regionStr);
        for (final Path vcfFile : userVcfFiles) {
            try (VCFReader in = VCFReaderFactory.makeDefault().open(vcfFile, requireIndex)) {
                final VCFHeader header = in.getHeader();
                for (final String sn : header.getSampleNamesInOrder()) {
                    if (genotypeSampleNames.contains(sn)) {
                        LOG.error("duplicate sample name " + sn);
                        return -1;
                    }
                    genotypeSampleNames.add(sn);
                }
                final SAMSequenceDictionary dict1 = SequenceDictionaryUtils.extractRequired(header);
                if (dict == null) {
                    dict = dict1;
                } else {
                    SequenceUtil.assertSequenceDictionariesEqual(dict, dict1);
                }
            }
        }
        if (dict == null) {
            LOG.error("No sequence dictionary defined");
            return -1;
        }
        final SAMSequenceDictionary finalDict = dict;
        final Function<String, Integer> contig2tid = C -> {
            final int tid = finalDict.getSequenceIndex(C);
            if (tid == -1)
                throw new JvarkitException.ContigNotFoundInDictionary(C, finalDict);
            return tid;
        };
        final Comparator<String> compareContigs = (C1, C2) -> {
            if (C1.equals(C2))
                return 0;
            return contig2tid.apply(C1) - contig2tid.apply(C2);
        };
        final Comparator<VariantContext> compareChromPos = (V1, V2) -> {
            int i = compareContigs.compare(V1.getContig(), V2.getContig());
            if (i != 0)
                return i;
            return V1.getStart() - V2.getStart();
        };
        final Comparator<VariantContext> compareChromPosRef = (V1, V2) -> {
            int i = compareChromPos.compare(V1, V2);
            if (i != 0)
                return i;
            return V1.getReference().compareTo(V2.getReference());
        };
        final SimpleInterval rgn;
        final Predicate<VariantContext> accept;
        if (!StringUtil.isBlank(VCFMerge.this.regionStr)) {
            rgn = IntervalParserFactory.newInstance().dictionary(dict).enableWholeContig().make().apply(VCFMerge.this.regionStr).orElseThrow(IntervalParserFactory.exception(VCFMerge.this.regionStr));
            accept = (CTX) -> {
                return rgn.overlaps(CTX);
            };
        } else {
            accept = (VOL) -> true;
            rgn = null;
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
        if (with_dp)
            VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY);
        if (with_gq)
            VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_QUALITY_KEY);
        if (with_pl)
            VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_PL_KEY);
        if (with_ad)
            VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        if (with_ac)
            VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_COUNT_KEY);
        if (with_an)
            VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_NUMBER_KEY);
        if (with_af)
            VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_FREQUENCY_KEY);
        if (with_dp)
            VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY);
        final VCFHeader mergedHeader = new VCFHeader(metaData, genotypeSampleNames);
        mergedHeader.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, mergedHeader);
        array = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(mergedHeader), compareChromPosRef, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        array.setDestructiveIteration(true);
        for (final Path vcfFile : userVcfFiles) {
            try (VCFReader in = VCFReaderFactory.makeDefault().open(vcfFile, requireIndex)) {
                try (CloseableIterator<VariantContext> lit = (in.isQueryable() && rgn != null ? in.query(rgn) : in.iterator())) {
                    while (lit.hasNext()) {
                        final VariantContext ctx = lit.next();
                        if (!accept.test(ctx))
                            continue;
                        array.add(new VariantContextBuilder(ctx).unfiltered().genotypes(ctx.getGenotypes().stream().filter(G -> G.isCalled()).map(G -> {
                            final GenotypeBuilder gb = new GenotypeBuilder(G);
                            gb.noAttributes();
                            return gb.make();
                        }).collect(Collectors.toList())).rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make());
                    }
                }
            }
        }
        array.doneAdding();
        LOG.info("merging..." + userVcfFiles.size() + " vcfs");
        // create the context writer
        w = this.writingVariantsDelegate.open(outputFile);
        w.writeHeader(mergedHeader);
        iter = array.iterator();
        EqualRangeIterator<VariantContext> eqiter = new EqualRangeIterator<>(iter, compareChromPosRef);
        while (eqiter.hasNext()) {
            final List<VariantContext> row = eqiter.next();
            final VariantContext first = row.get(0);
            final List<Allele> alleles = new ArrayList<>();
            alleles.add(first.getReference());
            alleles.addAll(row.stream().flatMap(VC -> VC.getAlternateAlleles().stream()).filter(A -> !A.isNoCall()).collect(Collectors.toSet()));
            final VariantContextBuilder vcb = new VariantContextBuilder(null, first.getContig(), first.getStart(), first.getEnd(), alleles);
            final String id = row.stream().filter(V -> V.hasID()).map(ST -> ST.getID()).findFirst().orElse(null);
            if (!StringUtils.isBlank(id)) {
                vcb.id(id);
            }
            final Map<String, Genotype> sample2genotypes = new HashMap<>(genotypeSampleNames.size());
            final Set<String> remainingSamples = new HashSet<String>(genotypeSampleNames);
            int an = 0;
            int dp = -1;
            final Counter<Allele> ac = new Counter<>();
            for (final VariantContext ctx : row) {
                for (final Genotype gt : ctx.getGenotypes()) {
                    if (gt.isNoCall())
                        continue;
                    for (Allele a : gt.getAlleles()) {
                        ac.incr(a);
                        an++;
                    }
                    final GenotypeBuilder gb = new GenotypeBuilder(gt.getSampleName(), gt.getAlleles());
                    if (with_dp && gt.hasDP()) {
                        if (dp < 0)
                            dp = 0;
                        dp += gt.getDP();
                        gb.DP(gt.getDP());
                    }
                    if (with_gq && gt.hasGQ())
                        gb.GQ(gt.getGQ());
                    if (with_pl && gt.hasPL())
                        gb.PL(gt.getPL());
                    if (with_ad && gt.hasAD()) {
                        final int[] src_ad = gt.getAD();
                        final int[] dest_ad = new int[alleles.size()];
                        Arrays.fill(dest_ad, 0);
                        for (int i = 0; i < src_ad.length && i < ctx.getAlleles().size(); i++) {
                            final Allele a1 = ctx.getAlleles().get(i);
                            final int dest_idx = alleles.indexOf(a1);
                            if (dest_idx >= 0 && dest_idx < dest_ad.length) {
                                dest_ad[dest_idx] = src_ad[i];
                            }
                            gb.AD(dest_ad);
                        }
                    }
                    sample2genotypes.put(gt.getSampleName(), gb.make());
                }
            }
            remainingSamples.removeAll(sample2genotypes.keySet());
            for (String sampleName : remainingSamples) {
                final Genotype gt;
                if (this.useHomRefForUnknown) {
                    final List<Allele> list = new ArrayList<>(ploidy);
                    for (int i = 0; i < ploidy; i++) list.add(first.getReference());
                    an += ploidy;
                    gt = GenotypeBuilder.create(sampleName, list);
                } else {
                    gt = GenotypeBuilder.createMissing(sampleName, this.ploidy);
                }
                sample2genotypes.put(sampleName, gt);
            }
            if (with_an)
                vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, an);
            if (with_ac)
                vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, alleles.subList(1, alleles.size()).stream().mapToInt(A -> (int) ac.count(A)).toArray());
            if (with_af && an > 0) {
                final double finalAn = an;
                vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleles.subList(1, alleles.size()).stream().mapToDouble(A -> ac.count(A) / finalAn).toArray());
            }
            if (with_dp && dp >= 0) {
                vcb.attribute(VCFConstants.DEPTH_KEY, dp);
            }
            vcb.genotypes(sample2genotypes.values());
            w.add(vcb.make());
        }
        eqiter.close();
        CloserUtil.close(w);
        w = null;
        array.cleanup();
        array = null;
        CloserUtil.close(iter);
        iter = null;
        return 0;
    } catch (Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(w);
        CloserUtil.close(iter);
        if (array != null)
            array.cleanup();
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashMap(java.util.HashMap) IntervalParserFactory(com.github.lindenb.jvarkit.samtools.util.IntervalParserFactory) Function(java.util.function.Function) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Counter(com.github.lindenb.jvarkit.util.Counter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) SortingCollection(htsjdk.samtools.util.SortingCollection) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) Comparator(java.util.Comparator) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) Counter(com.github.lindenb.jvarkit.util.Counter) TreeSet(java.util.TreeSet) VCFReader(htsjdk.variant.vcf.VCFReader) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Path(java.nio.file.Path) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Aggregations

SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval)67 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)46 List (java.util.List)45 Path (java.nio.file.Path)44 ArrayList (java.util.ArrayList)44 Locatable (htsjdk.samtools.util.Locatable)42 Parameter (com.beust.jcommander.Parameter)41 Program (com.github.lindenb.jvarkit.util.jcommander.Program)41 Logger (com.github.lindenb.jvarkit.util.log.Logger)41 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)37 Collectors (java.util.stream.Collectors)37 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)35 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)34 CloserUtil (htsjdk.samtools.util.CloserUtil)34 Set (java.util.Set)34 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)33 CloseableIterator (htsjdk.samtools.util.CloseableIterator)32 SAMFileHeader (htsjdk.samtools.SAMFileHeader)31 SamReader (htsjdk.samtools.SamReader)31 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)30