Search in sources :

Example 1 with SimpleInterval

use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

Source: the method getCdsInterval of the class Transcript.

/**
 * Return the genomic interval spanning from the start codon to the stop codon,
 * provided both codons are defined for this transcript; empty otherwise.
 */
public default Optional<Locatable> getCdsInterval() {
    if (!hasCodonStartDefined() || !hasCodonStopDefined())
        return Optional.empty();
    // Seed the bounds with the transcript's own coordinates, then fold in both
    // codons; taking min/max over both handles either strand orientation.
    int lo = this.getEnd();
    int hi = this.getStart();
    for (final Locatable codon : new Locatable[] { getCodonStart().get(), getCodonStop().get() }) {
        lo = Math.min(codon.getStart(), lo);
        hi = Math.max(codon.getEnd(), hi);
    }
    return Optional.of(new SimpleInterval(getContig(), lo, hi));
}
Also used : SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Locatable(htsjdk.samtools.util.Locatable)

Example 2 with SimpleInterval

use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

Source: the method advance of the class SlidingWindowIterator.

/**
 * Advance the iterator: return the next completed sliding window and its
 * variants, or null at end of input. Windows are buffered per contig and
 * released once no further record can overlap them.
 */
@Override
protected Entry<? extends Locatable, List<V>> advance() {
    for (; ; ) {
        // Flush any window already marked complete before reading more input.
        if (!to_be_released.isEmpty()) {
            final Window<V> w = to_be_released.pollFirst();
            return new AbstractMap.SimpleEntry<>(w.interval, w.variants);
        }
        final V ctx = this.delegate.hasNext() ? this.delegate.next() : null;
        // new contig (or EOF)? everything buffered for the previous contig is complete
        if (ctx == null || !ctx.getContig().equals(prevContig)) {
            to_be_released.addAll(this.buffer);
            // EOF and nothing left to emit
            if (ctx == null && to_be_released.isEmpty())
                return null;
            interval2win.clear();
            buffer.clear();
        }
        // loop again: to_be_released might now be non-empty
        if (ctx == null)
            continue;
        // record current contig (original code assigned this twice; once suffices)
        this.prevContig = ctx.getContig();
        // release windows ending before the current record starts
        int i = 0;
        while (i < buffer.size()) {
            final Window<V> w = buffer.get(i);
            if (w.getEnd() < ctx.getStart()) {
                to_be_released.add(w);
                buffer.remove(i);
                interval2win.remove(w.interval);
            } else {
                i++;
            }
        }
        // rewind x1 to the left-most window start that can still overlap ctx
        int x1 = ctx.getStart() - ctx.getStart() % this.window_shift;
        while (x1 - this.window_shift + this.window_size >= ctx.getStart()) {
            x1 -= this.window_shift;
        }
        // assign ctx to every sliding window overlapping it, creating windows on demand
        for (; ; ) {
            final SimpleInterval r = new SimpleInterval(ctx.getContig(), Math.max(1, x1), Math.max(1, x1 + this.window_size));
            if (r.getStart() > ctx.getEnd())
                break;
            if (r.overlaps(ctx)) {
                Window<V> w = interval2win.get(r);
                if (w == null) {
                    w = new Window<V>(r);
                    interval2win.put(r, w);
                    buffer.add(w);
                }
                w.variants.add(ctx);
            }
            x1 += this.window_shift;
        }
    }
}
Also used : SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval)

Example 3 with SimpleInterval

use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

Source: the method doWork of the class VcfFisherCombinatorics.

/**
 * Entry point: reads a pedigree, a BED of gene intervals and a VCF; flags,
 * per gene, which cases/controls carry a CNV genotype; then recursively
 * searches combinations of genes and prints the best solution found.
 *
 * @param args command-line arguments; at most one VCF path (or stdin)
 * @return 0 on success, -1 on error
 */
@Override
public int doWork(final List<String> args) {
    try {
        final Pedigree pedigree = new PedigreeParser().parse(this.pedigreeFile);
        final IntervalTreeMap<List<GeneInfo>> geneInfoMap = new IntervalTreeMap<>();
        // a sample "carries" the CNV when its genotype is HET or HOM_VAR
        final Predicate<Genotype> isGtCnv = G -> G.isHet() || G.isHomVar();
        try (VCFIterator vcfIter = super.openVCFIterator(super.oneFileOrNull(args))) {
            final VCFHeader header = vcfIter.getHeader();
            final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(SequenceDictionaryUtils.extractRequired(header));
            if (!header.hasGenotypingData()) {
                LOG.error("No Genotype data in " + header);
                return -1;
            }
            final Map<String, Integer> sample2index = header.getSampleNameToOffset();
            // genotype column indexes of affected samples, sorted
            final List<Integer> casesIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isAffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
            LOG.info("cases N=" + casesIdx.size());
            if (casesIdx.isEmpty()) {
                LOG.error("No affected/cases sample in the input VCF");
                return -1;
            }
            // genotype column indexes of unaffected samples, sorted
            final List<Integer> controlsIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isUnaffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
            LOG.info("controls N=" + controlsIdx.size());
            if (controlsIdx.isEmpty()) {
                LOG.error("No unaffected/controls sample in the input VCF");
                return -1;
            }
            // variant filter: optionally drop BND, FILTERed, and high-AF variants
            final Predicate<VariantContext> acceptCtx = V -> {
                if (discard_bnd && V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
                    return false;
                if (discard_filtered && V.isFiltered())
                    return false;
                if (max_af < 1.0) {
                    // allele frequency computed over cases+controls only
                    final Iterator<Integer> it = Stream.concat(controlsIdx.stream(), casesIdx.stream()).iterator();
                    int ac = 0;
                    int an = 0;
                    while (it.hasNext()) {
                        switch(V.getGenotype(it.next()).getType()) {
                            case HOM_VAR:
                                ac += 2;
                                an += 2;
                                break;
                            case HOM_REF:
                                an += 2;
                                break;
                            case HET:
                                ac += 1;
                                an += 2;
                                break;
                            default:
                                break;
                        }
                    }
                    if (an == 0)
                        return false;
                    if (ac / (double) an > max_af)
                        return false;
                }
                return true;
            };
            /* load gene definitions from the BED file */
            try (BedLineReader br = new BedLineReader(this.bedPath)) {
                while (br.hasNext()) {
                    final BedLine bed = br.next();
                    final String ctg = ctgConverter.apply(bed.getContig());
                    if (StringUtil.isBlank(ctg))
                        continue;
                    final GeneInfo geneInfo = new GeneInfo();
                    geneInfo.casesflags = new BitSet(casesIdx.size());
                    geneInfo.controlsflags = new BitSet(controlsIdx.size());
                    geneInfo.contig = ctg;
                    geneInfo.name = bed.getOrDefault(3, "undefined");
                    geneInfo.parts.add(new SimpleInterval(bed).renameContig(ctg));
                    // computeIfAbsent replaces the original get/null-check/put dance
                    geneInfoMap.computeIfAbsent(new Interval(geneInfo), K -> new ArrayList<>()).add(geneInfo);
                }
            }
            if (geneInfoMap.isEmpty()) {
                LOG.error("no gene found in " + this.bedPath);
                return -1;
            }
            LOG.info("reading variants...");
            while (vcfIter.hasNext()) {
                final VariantContext ctx = vcfIter.next();
                if (!acceptCtx.test(ctx))
                    continue;
                for (final List<GeneInfo> gil : geneInfoMap.getOverlapping(ctx)) {
                    for (final GeneInfo gi : gil) {
                        // flag every case carrying the CNV in this gene
                        for (int y = 0; y < casesIdx.size(); ++y) {
                            final Genotype gt = ctx.getGenotype(casesIdx.get(y));
                            if (!isGtCnv.test(gt))
                                continue;
                            gi.casesflags.set(y);
                        }
                        // flag every control carrying the CNV in this gene
                        for (int y = 0; y < controlsIdx.size(); ++y) {
                            final Genotype gt = ctx.getGenotype(controlsIdx.get(y));
                            if (!isGtCnv.test(gt))
                                continue;
                            gi.controlsflags.set(y);
                        }
                    }
                }
            }
            /* remove genes where no sample (case or control) was flagged */
            for (final List<GeneInfo> gil : geneInfoMap.values()) {
                gil.removeIf(GI -> GI.casesflags.nextSetBit(0) == -1 && GI.controlsflags.nextSetBit(0) == -1);
            }
            final List<GeneInfo> geneList = geneInfoMap.values().stream().filter(L -> !L.isEmpty()).flatMap(L -> L.stream()).sorted().collect(Collectors.toList());
            if (geneList.isEmpty()) {
                LOG.error("no gene found in " + this.bedPath);
                return -1;
            }
            LOG.info("N Genes=" + geneList.size());
            Solution best = null;
            // NOTE(review): upper bound is exclusive — confirm whether the value
            // max_num_genes itself was meant to be tried (i.e. i <= min(...)).
            for (int i = 2; i < Math.min(this.max_num_genes, geneList.size()); i++) {
                LOG.info("starting loop from " + i);
                best = recursion(geneList, casesIdx, controlsIdx, new ArrayList<>(), i, best);
            }
            try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
                pw.println(best);
                pw.flush();
            }
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) ArrayList(java.util.ArrayList) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) Interval(htsjdk.samtools.util.Interval) StringUtil(htsjdk.samtools.util.StringUtil) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) PedigreeParser(com.github.lindenb.jvarkit.pedigree.PedigreeParser) Map(java.util.Map) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) VCFConstants(htsjdk.variant.vcf.VCFConstants) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) Iterator(java.util.Iterator) Pedigree(com.github.lindenb.jvarkit.pedigree.Pedigree) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) Collectors(java.util.stream.Collectors) List(java.util.List) Stream(java.util.stream.Stream) FractionConverter(com.github.lindenb.jvarkit.jcommander.converter.FractionConverter) VariantContext(htsjdk.variant.variantcontext.VariantContext) BitSet(java.util.BitSet) Collections(java.util.Collections) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) PedigreeParser(com.github.lindenb.jvarkit.pedigree.PedigreeParser) VCFIterator(htsjdk.variant.vcf.VCFIterator) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) 
SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) VCFIterator(htsjdk.variant.vcf.VCFIterator) PrintWriter(java.io.PrintWriter) BitSet(java.util.BitSet) Genotype(htsjdk.variant.variantcontext.Genotype) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) Pedigree(com.github.lindenb.jvarkit.pedigree.Pedigree) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Interval(htsjdk.samtools.util.Interval) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval)

Example 4 with SimpleInterval

use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

Source: the method doWork of the class FindGVCFsBlocks.

/**
 * Entry point: scans one or more GVCF/interval inputs and computes the
 * genomic blocks whose END boundaries are common to ALL inputs, writing the
 * result as an interval list. Processing is incremental: input i is merged
 * with the accumulated blocks of inputs 0..i-1 held in a temp BED file.
 *
 * @param args paths (or lists of paths) to the input GVCFs/intervals
 * @return 0 on success, -1 on error
 */
@SuppressWarnings("resource")
@Override
public int doWork(final List<String> args) {
    Path tmpBedFile0 = null;
    Path tmpBedFile1 = null;
    try {
        if (this.bedPath != null) {
            IOUtil.assertFileIsReadable(this.bedPath);
        }
        final List<Path> inputs = IOUtils.unrollPaths(args);
        if (inputs.isEmpty()) {
            LOG.error("input missing");
            return -1;
        }
        // output must be an interval list (possibly compressed)
        if (this.outputFile != null) {
            final String fname = this.outputFile.getFileName().toString();
            if (!fname.endsWith(FileExtensions.INTERVAL_LIST) && !fname.endsWith(FileExtensions.COMPRESSED_INTERVAL_LIST)) {
                LOG.error("Output should end with " + FileExtensions.INTERVAL_LIST + " or " + FileExtensions.COMPRESSED_INTERVAL_LIST);
                return -1;
            }
        }
        // default temp dir: next to the output file, else the system default
        if (this.tmpDir == null && this.outputFile != null) {
            this.tmpDir = this.outputFile.getParent();
        }
        if (this.tmpDir == null) {
            this.tmpDir = IOUtils.getDefaultTempDir();
        }
        IOUtil.assertDirectoryIsWritable(this.tmpDir);
        // tmpBedFile0 = blocks being written for this round;
        // tmpBedFile1 = accumulated blocks from the previous round
        tmpBedFile0 = Files.createTempFile(this.tmpDir, "tmp.", ".bed");
        tmpBedFile1 = Files.createTempFile(this.tmpDir, "tmp.", ".bed");
        SAMSequenceDictionary dict = null;
        final long initMilliSec = System.currentTimeMillis();
        for (int i = 0; i < inputs.size(); i++) {
            final long startMilliSec = System.currentTimeMillis();
            LOG.info(inputs.get(i) + " " + (i + 1) + "/" + inputs.size());
            try (GVCFOrIntervalIterator r0 = openInput(inputs.get(i))) {
                // all inputs must share the same sequence dictionary
                if (dict != null) {
                    SequenceUtil.assertSequenceDictionariesEqual(dict, r0.getSAMSequenceDictionary());
                } else {
                    dict = r0.getSAMSequenceDictionary();
                }
                long count_variants = 0L;
                try (IntervalListWriter pw = new IntervalListWriter(tmpBedFile0, dict)) {
                    /* first VCF , just convert to bed */
                    if (i == 0) {
                        while (r0.hasNext()) {
                            final Locatable loc = r0.next();
                            pw.add(loc);
                            count_variants++;
                        }
                    } else /* merge previous bed with current VCF using INFO/END */
                    {
                        // start0/start1 remember where the current unmatched run began
                        Locatable start0 = null;
                        Locatable start1 = null;
                        try (IntervalListIterator r1 = new IntervalListIterator(tmpBedFile1)) {
                            PeekableIterator<Locatable> peek0 = new PeekableIterator<>(r0);
                            PeekableIterator<Locatable> peek1 = new PeekableIterator<>(r1);
                            while (peek0.hasNext() && peek1.hasNext()) {
                                final Locatable loc0 = peek0.peek();
                                final Locatable loc1 = peek1.peek();
                                if (!loc0.contigsMatch(loc1)) {
                                    throw new IllegalStateException("unexpected: not the same contigs " + loc0 + " " + loc1);
                                }
                                if (start0 == null)
                                    start0 = loc0;
                                if (start1 == null)
                                    start1 = loc1;
                                final int end0 = loc0.getEnd();
                                final int end1 = loc1.getEnd();
                                // advance whichever side ends first; emit a block
                                // only when both sides agree on an END position
                                if (end0 < end1) {
                                    peek0.next();
                                    continue;
                                } else if (end0 > end1) {
                                    peek1.next();
                                    continue;
                                } else {
                                    /* end0==end1 */
                                    pw.add(new SimpleInterval(loc0.getContig(), (Math.min(start0.getStart(), start1.getStart())), loc0.getEnd()));
                                    count_variants++;
                                    // consumme
                                    peek0.next();
                                    // consumme
                                    peek1.next();
                                    start0 = null;
                                    start1 = null;
                                }
                            }
                            // leftover intervals mean the inputs end at different
                            // positions: warn (lenient) or fail (strict)
                            if (lenient_discordant_end) {
                                while (peek0.hasNext()) {
                                    final Locatable loc = peek0.next();
                                    LOG.warn("extra interval 0 : " + loc);
                                }
                                while (peek1.hasNext()) {
                                    final Locatable loc = peek1.next();
                                    LOG.warn("extra interval 1 : " + loc);
                                }
                            } else {
                                if (peek0.hasNext())
                                    throw new IllegalStateException("peek0 has Next ? " + peek0.next() + " use --lenient ?");
                                if (peek1.hasNext())
                                    throw new IllegalStateException("peek1 has Next ? " + peek1.next() + " use --lenient ?");
                            }
                            peek0.close();
                            peek1.close();
                        }
                    }
                    final long millisecPerVcf = (System.currentTimeMillis() - initMilliSec) / (i + 1L);
                    LOG.info("N=" + count_variants + ". That took: " + StringUtils.niceDuration(System.currentTimeMillis() - startMilliSec) + " Remains: " + StringUtils.niceDuration((inputs.size() - (i + 1)) * millisecPerVcf));
                }
                // end writer
                // this round's output becomes next round's accumulated input
                Files.deleteIfExists(tmpBedFile1);
                Files.move(tmpBedFile0, tmpBedFile1);
            }
        }
        // optional BED restricting which blocks are emitted
        final IntervalTreeMap<Boolean> intervalTreeMap;
        if (this.bedPath != null) {
            try (BedLineReader blr = new BedLineReader(this.bedPath)) {
                intervalTreeMap = blr.toIntervalTreeMap(BED -> Boolean.TRUE);
            }
        } else {
            intervalTreeMap = null;
        }
        // final pass: filter by BED and merge consecutive same-contig blocks
        // until each reaches min_block_size
        try (IntervalListWriter w = new IntervalListWriter(this.outputFile, dict)) {
            try (IntervalListIterator r1 = new IntervalListIterator(tmpBedFile1)) {
                final PeekableIterator<Locatable> peek1 = new PeekableIterator<>(r1);
                while (peek1.hasNext()) {
                    Locatable loc = peek1.next();
                    if (intervalTreeMap != null && !intervalTreeMap.containsOverlapping(loc)) {
                        continue;
                    }
                    while (this.min_block_size > 0 && peek1.hasNext()) {
                        final Locatable loc2 = peek1.peek();
                        if (!loc2.contigsMatch(loc))
                            break;
                        if (CoordMath.getLength(loc.getStart(), loc2.getEnd()) > this.min_block_size)
                            break;
                        loc = new SimpleInterval(loc.getContig(), loc.getStart(), loc2.getEnd());
                        // consumme loc2
                        peek1.next();
                    }
                    w.add(loc);
                }
                peek1.close();
            }
            Files.deleteIfExists(tmpBedFile1);
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        // best-effort cleanup of temp files; deletion failures are ignored
        if (tmpBedFile0 != null)
            try {
                Files.deleteIfExists(tmpBedFile0);
            } catch (Throwable err) {
            }
        if (tmpBedFile1 != null)
            try {
                Files.deleteIfExists(tmpBedFile1);
            } catch (Throwable err) {
            }
    }
}
Also used : Path(java.nio.file.Path) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) IOUtil(htsjdk.samtools.util.IOUtil) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SAMTextHeaderCodec(htsjdk.samtools.SAMTextHeaderCodec) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) OutputStreamWriter(java.io.OutputStreamWriter) Path(java.nio.file.Path) PeekableIterator(htsjdk.samtools.util.PeekableIterator) VCFConstants(htsjdk.variant.vcf.VCFConstants) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) Files(java.nio.file.Files) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) BufferedWriter(java.io.BufferedWriter) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IntervalListCodec(htsjdk.tribble.IntervalList.IntervalListCodec) VCFReader(htsjdk.variant.vcf.VCFReader) AbstractCloseableIterator(com.github.lindenb.jvarkit.iterator.AbstractCloseableIterator) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) BufferedLineReader(htsjdk.samtools.util.BufferedLineReader) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) 
VariantContextComparator(htsjdk.variant.variantcontext.VariantContextComparator) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) FileExtensions(htsjdk.samtools.util.FileExtensions) Closeable(java.io.Closeable) CoordMath(htsjdk.samtools.util.CoordMath) VariantContext(htsjdk.variant.variantcontext.VariantContext) BufferedReader(java.io.BufferedReader) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) PeekableIterator(htsjdk.samtools.util.PeekableIterator) Locatable(htsjdk.samtools.util.Locatable)

Example 5 with SimpleInterval

use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

Source: the method doWork of the class ValidateCnv.

@Override
public int doWork(final List<String> args) {
    if (this.extendFactor <= 0) {
        LOG.error("bad extend factor " + this.extendFactor);
        return -1;
    }
    if (this.treshold < 0 || this.treshold >= 0.25) {
        LOG.error("Bad treshold 0 < " + this.treshold + " >=0.25 ");
        return -1;
    }
    final Map<String, BamInfo> sample2bam = new HashMap<>();
    VariantContextWriter out = null;
    Iterator<? extends Locatable> iterIn = null;
    try {
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.rererencePath);
        final CRAMReferenceSource cramReferenceSource = new ReferenceSource(this.rererencePath);
        final List<Path> bamPaths = IOUtils.unrollPaths(this.bamFiles);
        final String input = oneFileOrNull(args);
        if (input == null) {
            iterIn = IntervalListProvider.empty().dictionary(dict).skipUnknownContigs().fromInputStream(stdin(), "bed").iterator();
        } else {
            final IntervalListProvider ilp = IntervalListProvider.from(input).setVariantPredicate(CTX -> {
                if (CTX.isSNP())
                    return false;
                final String svType = CTX.getAttributeAsString(VCFConstants.SVTYPE, "");
                if (svType != null && (svType.equals("INV") || svType.equals("BND")))
                    return false;
                return true;
            }).dictionary(dict).skipUnknownContigs();
            iterIn = ilp.stream().iterator();
        }
        /* register each bam */
        for (final Path p2 : bamPaths) {
            final BamInfo bi = new BamInfo(p2, cramReferenceSource);
            if (sample2bam.containsKey(bi.sampleName)) {
                LOG.error("sample " + bi.sampleName + " specified twice.");
                bi.close();
                return -1;
            }
            sample2bam.put(bi.sampleName, bi);
        }
        if (sample2bam.isEmpty()) {
            LOG.error("no bam was defined");
            return -1;
        }
        final Set<VCFHeaderLine> metadata = new HashSet<>();
        final VCFInfoHeaderLine infoSVSamples = new VCFInfoHeaderLine("N_SAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples that could carry a SV");
        metadata.add(infoSVSamples);
        final VCFInfoHeaderLine infoSvLen = new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length");
        metadata.add(infoSvLen);
        final BiFunction<String, String, VCFFormatHeaderLine> makeFmt = (TAG, DESC) -> new VCFFormatHeaderLine(TAG, 1, VCFHeaderLineType.Integer, DESC);
        final VCFFormatHeaderLine formatCN = new VCFFormatHeaderLine("CN", 1, VCFHeaderLineType.Float, "normalized copy-number. Treshold was " + this.treshold);
        metadata.add(formatCN);
        final VCFFormatHeaderLine nReadsSupportingSv = makeFmt.apply("RSD", "number of split reads supporting SV.");
        metadata.add(nReadsSupportingSv);
        final VCFFilterHeaderLine filterAllDel = new VCFFilterHeaderLine("ALL_DEL", "number of samples greater than 1 and all are deletions");
        metadata.add(filterAllDel);
        final VCFFilterHeaderLine filterAllDup = new VCFFilterHeaderLine("ALL_DUP", "number of samples  greater than  1 and all are duplication");
        metadata.add(filterAllDup);
        final VCFFilterHeaderLine filterNoSV = new VCFFilterHeaderLine("NO_SV", "There is no DUP or DEL in this variant");
        metadata.add(filterNoSV);
        final VCFFilterHeaderLine filterHomDel = new VCFFilterHeaderLine("HOM_DEL", "There is one Homozygous deletion.");
        metadata.add(filterHomDel);
        final VCFFilterHeaderLine filterHomDup = new VCFFilterHeaderLine("HOM_DUP", "There is one Homozygous duplication.");
        metadata.add(filterHomDup);
        VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_FILTER_KEY, VCFConstants.GENOTYPE_QUALITY_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY, VCFConstants.ALLELE_NUMBER_KEY);
        final VCFHeader header = new VCFHeader(metadata, sample2bam.keySet());
        if (dict != null)
            header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
        out = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
        out.writeHeader(header);
        final Allele DUP_ALLELE = Allele.create("<DUP>", false);
        final Allele DEL_ALLELE = Allele.create("<DEL>", false);
        final Allele REF_ALLELE = Allele.create("N", true);
        while (iterIn.hasNext()) {
            final Locatable ctx = iterIn.next();
            if (ctx == null)
                continue;
            final SAMSequenceRecord ssr = dict.getSequence(ctx.getContig());
            if (ssr == null || ctx.getStart() >= ssr.getSequenceLength())
                continue;
            final int svLen = ctx.getLengthOnReference();
            if (svLen < this.min_abs_sv_size)
                continue;
            if (svLen > this.max_abs_sv_size)
                continue;
            int n_samples_with_cnv = 0;
            final SimplePosition breakPointLeft = new SimplePosition(ctx.getContig(), ctx.getStart());
            final SimplePosition breakPointRight = new SimplePosition(ctx.getContig(), ctx.getEnd());
            final int extend = 1 + (int) (svLen * this.extendFactor);
            final int leftPos = Math.max(1, breakPointLeft.getPosition() - extend);
            final int array_mid_start = breakPointLeft.getPosition() - leftPos;
            final int array_mid_end = breakPointRight.getPosition() - leftPos;
            final int rightPos = Math.min(breakPointRight.getPosition() + extend, ssr.getSequenceLength());
            final VariantContextBuilder vcb = new VariantContextBuilder();
            vcb.chr(ctx.getContig());
            vcb.start(ctx.getStart());
            vcb.stop(ctx.getEnd());
            vcb.attribute(VCFConstants.END_KEY, ctx.getEnd());
            final Set<Allele> alleles = new HashSet<>();
            alleles.add(REF_ALLELE);
            int count_dup = 0;
            int count_del = 0;
            int an = 0;
            final Counter<Allele> countAlleles = new Counter<>();
            final List<Genotype> genotypes = new ArrayList<>(sample2bam.size());
            Double badestGQ = null;
            final double[] raw_coverage = new double[CoordMath.getLength(leftPos, rightPos)];
            for (final String sampleName : sample2bam.keySet()) {
                final BamInfo bi = sample2bam.get(sampleName);
                Arrays.fill(raw_coverage, 0.0);
                int n_reads_supporting_sv = 0;
                try (CloseableIterator<SAMRecord> iter2 = bi.samReader.queryOverlapping(ctx.getContig(), leftPos, rightPos)) {
                    while (iter2.hasNext()) {
                        final SAMRecord rec = iter2.next();
                        if (!SAMRecordDefaultFilter.accept(rec, this.min_mapq))
                            continue;
                        final Cigar cigar = rec.getCigar();
                        if (cigar == null || cigar.isEmpty())
                            continue;
                        // any clip supporting deletion ?
                        boolean read_supports_cnv = false;
                        final int breakpoint_distance = 10;
                        // any clip on left ?
                        if (cigar.isLeftClipped() && rec.getUnclippedStart() < rec.getAlignmentStart() && new SimpleInterval(ctx.getContig(), rec.getUnclippedStart(), rec.getAlignmentStart()).withinDistanceOf(breakPointLeft, breakpoint_distance)) {
                            read_supports_cnv = true;
                        }
                        // any clip on right ?
                        if (!read_supports_cnv && cigar.isRightClipped() && rec.getAlignmentEnd() < rec.getUnclippedEnd() && new SimpleInterval(ctx.getContig(), rec.getAlignmentEnd(), rec.getUnclippedEnd()).withinDistanceOf(breakPointRight, breakpoint_distance)) {
                            read_supports_cnv = true;
                        }
                        if (read_supports_cnv) {
                            n_reads_supporting_sv++;
                        }
                        int ref = rec.getStart();
                        for (final CigarElement ce : cigar) {
                            final CigarOperator op = ce.getOperator();
                            if (op.consumesReferenceBases()) {
                                if (op.consumesReadBases()) {
                                    for (int x = 0; x < ce.getLength() && ref + x - leftPos < raw_coverage.length; ++x) {
                                        final int p = ref + x - leftPos;
                                        if (p < 0 || p >= raw_coverage.length)
                                            continue;
                                        raw_coverage[p]++;
                                    }
                                }
                                ref += ce.getLength();
                            }
                        }
                    }
                // end while iter record
                }
                // end try query for iterator
                // test for great difference between DP left and DP right
                final OptionalDouble medianDepthLeft = Percentile.median().evaluate(raw_coverage, 0, array_mid_start);
                final OptionalDouble medianDepthRight = Percentile.median().evaluate(raw_coverage, array_mid_end, raw_coverage.length - array_mid_end);
                // any is just too low
                if (!medianDepthLeft.isPresent() || medianDepthLeft.getAsDouble() < this.min_depth || !medianDepthRight.isPresent() || medianDepthRight.getAsDouble() < this.min_depth) {
                    final Genotype gt2 = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("LowDp").make();
                    genotypes.add(gt2);
                    continue;
                }
                final double difference_factor = 2.0;
                // even if a value is divided , it remains greater than the other size
                if (medianDepthLeft.getAsDouble() / difference_factor > medianDepthRight.getAsDouble() || medianDepthRight.getAsDouble() / difference_factor > medianDepthLeft.getAsDouble()) {
                    final Genotype gt2 = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("DiffLR").make();
                    genotypes.add(gt2);
                    continue;
                }
                // run median to smooth spline
                final double[] smoothed_cov = new RunMedian(RunMedian.getTurlachSize(raw_coverage.length)).apply(raw_coverage);
                final double[] bounds_cov = IntStream.concat(IntStream.range(0, array_mid_start), IntStream.range(array_mid_end, smoothed_cov.length)).mapToDouble(IDX -> raw_coverage[IDX]).toArray();
                final OptionalDouble optMedianBound = Percentile.median().evaluate(bounds_cov);
                if (!optMedianBound.isPresent() || optMedianBound.getAsDouble() == 0) {
                    final Genotype gt2 = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("MedZero").make();
                    genotypes.add(gt2);
                    continue;
                }
                final double medianBound = optMedianBound.getAsDouble();
                // divide coverage per medianBound
                final double[] normalized_mid_coverage = new double[array_mid_end - array_mid_start];
                for (int i = 0; i < normalized_mid_coverage.length; ++i) {
                    normalized_mid_coverage[i] = smoothed_cov[array_mid_start + i] / medianBound;
                }
                final double normDepth = Percentile.median().evaluate(normalized_mid_coverage).getAsDouble();
                final boolean is_sv;
                final boolean is_hom_deletion = Math.abs(normDepth - 0.0) <= this.treshold;
                final boolean is_het_deletion = Math.abs(normDepth - 0.5) <= this.treshold || (!is_hom_deletion && normDepth <= 0.5);
                final boolean is_hom_dup = Math.abs(normDepth - 2.0) <= this.treshold || normDepth > 2.0;
                final boolean is_het_dup = Math.abs(normDepth - 1.5) <= this.treshold || (!is_hom_dup && normDepth >= 1.5);
                final boolean is_ref = Math.abs(normDepth - 1.0) <= this.treshold;
                final double theoritical_depth;
                final GenotypeBuilder gb;
                if (is_ref) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, REF_ALLELE));
                    is_sv = false;
                    theoritical_depth = 1.0;
                    an += 2;
                } else if (is_het_deletion) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, DEL_ALLELE));
                    alleles.add(DEL_ALLELE);
                    is_sv = true;
                    theoritical_depth = 0.5;
                    count_del++;
                    an += 2;
                    countAlleles.incr(DEL_ALLELE);
                } else if (is_hom_deletion) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(DEL_ALLELE, DEL_ALLELE));
                    alleles.add(DEL_ALLELE);
                    vcb.filter(filterHomDel.getID());
                    is_sv = true;
                    theoritical_depth = 0.0;
                    count_del++;
                    an += 2;
                    countAlleles.incr(DEL_ALLELE, 2);
                } else if (is_het_dup) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, DUP_ALLELE));
                    alleles.add(DUP_ALLELE);
                    is_sv = true;
                    theoritical_depth = 1.5;
                    count_dup++;
                    an += 2;
                    countAlleles.incr(DUP_ALLELE);
                } else if (is_hom_dup) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(DUP_ALLELE, DUP_ALLELE));
                    alleles.add(DUP_ALLELE);
                    vcb.filter(filterHomDup.getID());
                    is_sv = true;
                    theoritical_depth = 2.0;
                    count_dup++;
                    an += 2;
                    countAlleles.incr(DUP_ALLELE, 2);
                } else {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("Ambigous");
                    is_sv = false;
                    theoritical_depth = 1.0;
                }
                if (is_sv) {
                    n_samples_with_cnv++;
                }
                double gq = Math.abs(theoritical_depth - normDepth);
                gq = Math.min(0.5, gq);
                gq = gq * gq;
                gq = gq / 0.25;
                gq = 99 * (1.0 - gq);
                gb.GQ((int) gq);
                if (badestGQ == null || badestGQ.compareTo(gq) > 0) {
                    badestGQ = gq;
                }
                gb.attribute(formatCN.getID(), normDepth);
                gb.attribute(nReadsSupportingSv.getID(), n_reads_supporting_sv);
                genotypes.add(gb.make());
            }
            vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, an);
            final List<Allele> orderedAlleles = new ArrayList<>(alleles);
            Collections.sort(orderedAlleles);
            if (orderedAlleles.size() > 1) {
                final List<Integer> acL = new ArrayList<>();
                final List<Double> afL = new ArrayList<>();
                for (int i = 1; i < orderedAlleles.size(); i++) {
                    final Allele a = orderedAlleles.get(i);
                    final int c = (int) countAlleles.count(a);
                    acL.add(c);
                    if (an > 0)
                        afL.add(c / (double) an);
                }
                vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, acL);
                if (an > 0)
                    vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, afL);
            }
            // if(alleles.size()<=1) continue;
            vcb.alleles(orderedAlleles);
            vcb.noID();
            vcb.genotypes(genotypes);
            vcb.attribute(infoSVSamples.getID(), n_samples_with_cnv);
            vcb.attribute(infoSvLen.getID(), svLen);
            if (count_dup == sample2bam.size() && sample2bam.size() != 1) {
                vcb.filter(filterAllDup.getID());
            }
            if (count_del == sample2bam.size() && sample2bam.size() != 1) {
                vcb.filter(filterAllDel.getID());
            }
            if (n_samples_with_cnv == 0) {
                vcb.filter(filterNoSV.getID());
            }
            if (badestGQ != null) {
                vcb.log10PError(badestGQ / -10.0);
            }
            out.add(vcb.make());
        }
        progress.close();
        out.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iterIn);
        CloserUtil.close(out);
        sample2bam.values().forEach(F -> CloserUtil.close(F));
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) BiFunction(java.util.function.BiFunction) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) IntervalListProvider(com.github.lindenb.jvarkit.samtools.util.IntervalListProvider) CRAMReferenceSource(htsjdk.samtools.cram.ref.CRAMReferenceSource) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Percentile(com.github.lindenb.jvarkit.math.stats.Percentile) SAMRecord(htsjdk.samtools.SAMRecord) SimplePosition(com.github.lindenb.jvarkit.samtools.util.SimplePosition) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) IntStream(java.util.stream.IntStream) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Parameter(com.beust.jcommander.Parameter) OptionalDouble(java.util.OptionalDouble) 
HashMap(java.util.HashMap) ValidationStringency(htsjdk.samtools.ValidationStringency) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ReferenceSource(htsjdk.samtools.cram.ref.ReferenceSource) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) SAMRecordDefaultFilter(com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) Locatable(htsjdk.samtools.util.Locatable) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Iterator(java.util.Iterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) FractionConverter(com.github.lindenb.jvarkit.jcommander.converter.FractionConverter) Closeable(java.io.Closeable) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) RunMedian(com.github.lindenb.jvarkit.math.RunMedian) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) CRAMReferenceSource(htsjdk.samtools.cram.ref.CRAMReferenceSource) ReferenceSource(htsjdk.samtools.cram.ref.ReferenceSource) Counter(com.github.lindenb.jvarkit.util.Counter) CRAMReferenceSource(htsjdk.samtools.cram.ref.CRAMReferenceSource) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) HashSet(java.util.HashSet) Genotype(htsjdk.variant.variantcontext.Genotype) 
CigarOperator(htsjdk.samtools.CigarOperator) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) OptionalDouble(java.util.OptionalDouble) CigarElement(htsjdk.samtools.CigarElement) RunMedian(com.github.lindenb.jvarkit.math.RunMedian) SAMRecord(htsjdk.samtools.SAMRecord) Locatable(htsjdk.samtools.util.Locatable) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IntervalListProvider(com.github.lindenb.jvarkit.samtools.util.IntervalListProvider) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) SimplePosition(com.github.lindenb.jvarkit.samtools.util.SimplePosition) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Path(java.nio.file.Path) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) OptionalDouble(java.util.OptionalDouble) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Aggregations

SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval)67 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)46 List (java.util.List)45 Path (java.nio.file.Path)44 ArrayList (java.util.ArrayList)44 Locatable (htsjdk.samtools.util.Locatable)42 Parameter (com.beust.jcommander.Parameter)41 Program (com.github.lindenb.jvarkit.util.jcommander.Program)41 Logger (com.github.lindenb.jvarkit.util.log.Logger)41 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)37 Collectors (java.util.stream.Collectors)37 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)35 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)34 CloserUtil (htsjdk.samtools.util.CloserUtil)34 Set (java.util.Set)34 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)33 CloseableIterator (htsjdk.samtools.util.CloseableIterator)32 SAMFileHeader (htsjdk.samtools.SAMFileHeader)31 SamReader (htsjdk.samtools.SamReader)31 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)30