Search in sources :

Example 1 with DistanceParser

use of com.github.lindenb.jvarkit.util.bio.DistanceParser in project jvarkit by lindenb.

the class CnvSlidingWindow method doWork.

@Override
public int doWork(final List<String> args) {
    final List<Sample> samples = new ArrayList<>();
    try {
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
        final DistanceParser distParser = new DistanceParser();
        final int[] windows_array = Arrays.stream(CharSplitter.SEMICOLON.split(windowDefs)).filter(S -> !StringUtils.isBlank(S)).mapToInt(N -> distParser.applyAsInt(N)).toArray();
        if (windows_array.length == 0) {
            LOG.error("No window defined.");
            return -1;
        }
        if (windows_array.length % 2 != 0) {
            LOG.error("odd number of windows ? " + this.windowDefs);
            return -1;
        }
        final List<Path> inputBams = IOUtils.unrollPaths(args);
        if (inputBams.isEmpty()) {
            LOG.error("input bam file missing.");
            return -1;
        }
        final Set<String> sampleNames = new TreeSet<>();
        for (final Path samFile : inputBams) {
            final Sample sample = new Sample(samFile);
            if (sampleNames.contains(sample.name)) {
                LOG.error("duplicate sample " + sample.name);
                sample.close();
                return -1;
            }
            sampleNames.add(sample.name);
            samples.add(sample);
            SequenceUtil.assertSequenceDictionariesEqual(dict, sample.dict);
        }
        final List<Locatable> contigs = dict.getSequences().stream().filter(SR -> !SR.getSequenceName().matches(this.excludeRegex)).collect(Collectors.toCollection(ArrayList::new));
        if (this.excludeBedPath != null) {
            final BedLineCodec bedLineCodec = new BedLineCodec();
            final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(dict);
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
                final List<SimpleInterval> exclude = br.lines().filter(L -> !BedLine.isBedHeader(L)).map(L -> bedLineCodec.decode(L)).filter(B -> B != null && !StringUtils.isBlank(ctgConverter.apply(B.getContig()))).map(B -> new SimpleInterval(ctgConverter.apply(B.getContig()), B.getStart(), B.getEnd())).collect(Collectors.toList());
                boolean done = false;
                while (!done) {
                    done = true;
                    int i = 0;
                    while (i < contigs.size()) {
                        final Locatable contig = contigs.get(i);
                        final Locatable overlapper = exclude.stream().filter(EX -> EX.overlaps(contig)).findAny().orElse(null);
                        if (overlapper != null) {
                            contigs.remove(i);
                            contigs.addAll(split(contig, overlapper));
                            done = false;
                        } else {
                            i++;
                        }
                    }
                }
            }
        }
        contigs.sort(new ContigDictComparator(dict).createLocatableComparator());
        final Allele ref_allele = Allele.create("N", true);
        final Allele dup_allele = Allele.create("<DUP>", false);
        final Allele del_allele = Allele.create("<DEL>", false);
        final Function<Integer, List<Allele>> cnv2allele = CNV -> {
            switch(CNV) {
                case 0:
                    return Arrays.asList(ref_allele, ref_allele);
                case 1:
                    return Arrays.asList(ref_allele, dup_allele);
                case 2:
                    return Arrays.asList(dup_allele, dup_allele);
                case -1:
                    return Arrays.asList(ref_allele, del_allele);
                case -2:
                    return Arrays.asList(del_allele, del_allele);
                default:
                    throw new IllegalArgumentException("cnv:" + CNV);
            }
        };
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
        final VCFFormatHeaderLine fmtLeftCov = new VCFFormatHeaderLine("LC", 1, VCFHeaderLineType.Float, "Left median coverage.");
        final VCFFormatHeaderLine fmtMidCov = new VCFFormatHeaderLine("MC", 1, VCFHeaderLineType.Float, "Middle median coverage.");
        final VCFFormatHeaderLine fmtRightCov = new VCFFormatHeaderLine("RC", 1, VCFHeaderLineType.Float, "right median coverage.");
        final VCFFormatHeaderLine fmtLeftMedian = new VCFFormatHeaderLine("LM", 1, VCFHeaderLineType.Float, "Left normalized median coverage.");
        final VCFFormatHeaderLine fmtMidMedian = new VCFFormatHeaderLine("MM", 1, VCFHeaderLineType.Float, "Middle normalized median coverage.");
        final VCFFormatHeaderLine fmtRightMedian = new VCFFormatHeaderLine("RM", 1, VCFHeaderLineType.Float, "right normalized median coverage.");
        metaData.add(fmtLeftCov);
        metaData.add(fmtMidCov);
        metaData.add(fmtRightCov);
        metaData.add(fmtLeftMedian);
        metaData.add(fmtMidMedian);
        metaData.add(fmtRightMedian);
        final VCFHeader header = new VCFHeader(metaData, sampleNames);
        header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        final VariantContextWriter vcw = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
        vcw.writeHeader(header);
        for (final Locatable contig : contigs) {
            System.gc();
            final short[] array = new short[contig.getLengthOnReference()];
            final SortingCollection<Gt> sorter = SortingCollection.newInstance(Gt.class, new GtCodec(), (A, B) -> A.compare1(B), this.sorting.getMaxRecordsInRam(), this.sorting.getTmpPaths());
            for (int bam_index = 0; bam_index < samples.size(); bam_index++) {
                final Sample sampleBam = samples.get(bam_index);
                Arrays.fill(array, (short) 0);
                try (SAMRecordIterator iter = sampleBam.samReader.queryOverlapping(contig.getContig(), contig.getStart(), contig.getEnd())) {
                    while (iter.hasNext()) {
                        final SAMRecord rec = iter.next();
                        if (rec.getReadUnmappedFlag())
                            continue;
                        if (rec.getDuplicateReadFlag())
                            continue;
                        if (rec.isSecondaryOrSupplementary())
                            continue;
                        if (rec.getReadFailsVendorQualityCheckFlag())
                            continue;
                        final Cigar cigar = rec.getCigar();
                        if (cigar == null || cigar.isEmpty())
                            continue;
                        int refPos = rec.getStart();
                        for (final CigarElement ce : cigar) {
                            final CigarOperator op = ce.getOperator();
                            if (op.consumesReferenceBases()) {
                                if (op.consumesReadBases()) {
                                    for (int i = 0; i < ce.getLength(); i++) {
                                        final int idx = refPos - contig.getStart() + i;
                                        if (idx < 0)
                                            continue;
                                        if (idx >= array.length)
                                            break;
                                        if (array[idx] == Short.MAX_VALUE)
                                            continue;
                                        array[idx]++;
                                    }
                                }
                                refPos += ce.getLength();
                            }
                        }
                    }
                }
                for (int widx = 0; widx < windows_array.length; widx += 2) {
                    final int window_size = windows_array[widx + 0];
                    final int extend = (int) Math.ceil(window_size * this.extend);
                    if (extend <= 0)
                        continue;
                    if (window_size > contig.getLengthOnReference())
                        continue;
                    final int window_shift = windows_array[widx + 1];
                    LOG.info(contig + " " + window_size + "+-" + extend + ";" + window_shift + " " + sampleBam.name);
                    final Coverage leftcov = new Coverage(extend);
                    final Coverage rightcov = new Coverage(extend);
                    final Coverage leftrightcov = new Coverage(extend + extend);
                    final Coverage midcov = new Coverage(window_size);
                    for (int pos1 = contig.getStart(); pos1 + window_size + extend + extend < contig.getEnd(); pos1 += window_shift) {
                        leftcov.reset();
                        rightcov.reset();
                        leftrightcov.reset();
                        midcov.reset();
                        for (int x = 0; x < extend; x++) {
                            final int idx = pos1 - contig.getStart() + x;
                            leftcov.add(array[idx]);
                            leftrightcov.add(array[idx]);
                        }
                        final double leftMedian = leftcov.median();
                        if (leftMedian < this.min_depth)
                            continue;
                        for (int x = 0; x < extend; x++) {
                            final int idx = pos1 - contig.getStart() + extend + window_size + x;
                            rightcov.add(array[idx]);
                            leftrightcov.add(array[idx]);
                        }
                        final double rightMedian = rightcov.median();
                        if (rightMedian < this.min_depth)
                            continue;
                        for (int x = 0; x < window_size; x++) {
                            final int idx = pos1 - contig.getStart() + extend + x;
                            midcov.add(array[idx]);
                        }
                        final double median = leftrightcov.median();
                        if (rightcov.median() < this.min_depth)
                            continue;
                        for (int x = 0; x < midcov.count; x++) {
                            midcov.array[x] /= median;
                        }
                        for (int x = 0; x < leftcov.count; x++) {
                            leftcov.array[x] /= median;
                        }
                        for (int x = 0; x < rightcov.count; x++) {
                            rightcov.array[x] /= median;
                        }
                        final double norm_depth = midcov.median();
                        final int cnv = getCNVIndex(norm_depth);
                        if (cnv != CNV_UNDEFINED && cnv != 1 && getCNVIndex(leftcov.median()) == 1 && getCNVIndex(rightcov.median()) == 1) {
                            final Gt gt = new Gt();
                            gt.start = pos1 + extend;
                            gt.end = gt.start + window_size;
                            gt.sample_idx = bam_index;
                            gt.cnv = cnv;
                            gt.leftMedian = (float) leftMedian;
                            gt.midMedian = (float) median;
                            gt.rightMedian = (float) rightMedian;
                            gt.leftCov = (float) leftcov.median();
                            gt.midCov = (float) norm_depth;
                            gt.rightCov = (float) rightcov.median();
                            sorter.add(gt);
                        }
                    }
                }
            }
            sorter.setDestructiveIteration(true);
            try (CloseableIterator<Gt> iter = sorter.iterator()) {
                final EqualRangeIterator<Gt> eq = new EqualRangeIterator<>(iter, (A, B) -> A.compare0(B));
                while (eq.hasNext()) {
                    final List<Gt> row = eq.next();
                    if (row.isEmpty())
                        continue;
                    final Gt first = row.get(0);
                    final Set<Allele> alleles = new HashSet<>();
                    row.stream().flatMap(GT -> cnv2allele.apply(GT.cnv).stream()).forEach(CNV -> alleles.add(CNV));
                    alleles.add(ref_allele);
                    if (alleles.size() < 2)
                        continue;
                    final VariantContextBuilder vcb = new VariantContextBuilder(null, contig.getContig(), first.start, first.end, alleles);
                    vcb.attribute(VCFConstants.END_KEY, first.end);
                    final List<Genotype> genotypes = new ArrayList<>(samples.size());
                    for (final Gt gt : row) {
                        final GenotypeBuilder gb = new GenotypeBuilder(samples.get(gt.sample_idx).name, cnv2allele.apply(gt.cnv));
                        gb.attribute(fmtLeftMedian.getID(), (double) gt.leftMedian);
                        gb.attribute(fmtMidMedian.getID(), (double) gt.midMedian);
                        gb.attribute(fmtRightMedian.getID(), (double) gt.rightMedian);
                        gb.attribute(fmtLeftCov.getID(), (double) gt.leftCov);
                        gb.attribute(fmtMidCov.getID(), (double) gt.midCov);
                        gb.attribute(fmtRightCov.getID(), (double) gt.rightCov);
                        genotypes.add(gb.make());
                    }
                    vcb.genotypes(genotypes);
                    vcw.add(vcb.make());
                }
                eq.close();
            }
            sorter.cleanup();
        }
        vcw.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        samples.forEach(S -> CloserUtil.close(S));
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) DataOutputStream(java.io.DataOutputStream) StringUtil(htsjdk.samtools.util.StringUtil) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) EOFException(java.io.EOFException) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DataInputStream(java.io.DataInputStream) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) Function(java.util.function.Function) ValidationStringency(htsjdk.samtools.ValidationStringency) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) Closeable(java.io.Closeable) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) BufferedReader(java.io.BufferedReader) Collections(java.util.Collections) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) ArrayList(java.util.ArrayList) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) TreeSet(java.util.TreeSet) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) List(java.util.List) ArrayList(java.util.ArrayList) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) Genotype(htsjdk.variant.variantcontext.Genotype) CigarOperator(htsjdk.samtools.CigarOperator) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarElement(htsjdk.samtools.CigarElement) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SAMRecord(htsjdk.samtools.SAMRecord) Locatable(htsjdk.samtools.util.Locatable) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Path(java.nio.file.Path) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader)

Aggregations

Parameter (com.beust.jcommander.Parameter)1 ParametersDelegate (com.beust.jcommander.ParametersDelegate)1 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)1 CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter)1 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)1 SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval)1 JVarkitVersion (com.github.lindenb.jvarkit.util.JVarkitVersion)1 DistanceParser (com.github.lindenb.jvarkit.util.bio.DistanceParser)1 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)1 BedLine (com.github.lindenb.jvarkit.util.bio.bed.BedLine)1 BedLineCodec (com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec)1 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)1 EqualRangeIterator (com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator)1 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)1 Program (com.github.lindenb.jvarkit.util.jcommander.Program)1 Logger (com.github.lindenb.jvarkit.util.log.Logger)1 AbstractDataCodec (com.github.lindenb.jvarkit.util.picard.AbstractDataCodec)1 ContigDictComparator (com.github.lindenb.jvarkit.util.samtools.ContigDictComparator)1 WritingVariantsDelegate (com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate)1 Cigar (htsjdk.samtools.Cigar)1