
Example 16 with ContigDictComparator

Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in the project jvarkit by lindenb.

The doWork method of the class CnvSlidingWindow.

@Override
public int doWork(final List<String> args) {
    final List<Sample> samples = new ArrayList<>();
    try {
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
        final DistanceParser distParser = new DistanceParser();
        final int[] windows_array = Arrays.stream(CharSplitter.SEMICOLON.split(windowDefs)).filter(S -> !StringUtils.isBlank(S)).mapToInt(N -> distParser.applyAsInt(N)).toArray();
        if (windows_array.length == 0) {
            LOG.error("No window defined.");
            return -1;
        }
        if (windows_array.length % 2 != 0) {
            LOG.error("odd number of windows ? " + this.windowDefs);
            return -1;
        }
        final List<Path> inputBams = IOUtils.unrollPaths(args);
        if (inputBams.isEmpty()) {
            LOG.error("input bam file missing.");
            return -1;
        }
        final Set<String> sampleNames = new TreeSet<>();
        for (final Path samFile : inputBams) {
            final Sample sample = new Sample(samFile);
            if (sampleNames.contains(sample.name)) {
                LOG.error("duplicate sample " + sample.name);
                sample.close();
                return -1;
            }
            sampleNames.add(sample.name);
            samples.add(sample);
            SequenceUtil.assertSequenceDictionariesEqual(dict, sample.dict);
        }
        final List<Locatable> contigs = dict.getSequences().stream().filter(SR -> !SR.getSequenceName().matches(this.excludeRegex)).collect(Collectors.toCollection(ArrayList::new));
        if (this.excludeBedPath != null) {
            final BedLineCodec bedLineCodec = new BedLineCodec();
            final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(dict);
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
                final List<SimpleInterval> exclude = br.lines().filter(L -> !BedLine.isBedHeader(L)).map(L -> bedLineCodec.decode(L)).filter(B -> B != null && !StringUtils.isBlank(ctgConverter.apply(B.getContig()))).map(B -> new SimpleInterval(ctgConverter.apply(B.getContig()), B.getStart(), B.getEnd())).collect(Collectors.toList());
                boolean done = false;
                while (!done) {
                    done = true;
                    int i = 0;
                    while (i < contigs.size()) {
                        final Locatable contig = contigs.get(i);
                        final Locatable overlapper = exclude.stream().filter(EX -> EX.overlaps(contig)).findAny().orElse(null);
                        if (overlapper != null) {
                            contigs.remove(i);
                            contigs.addAll(split(contig, overlapper));
                            done = false;
                        } else {
                            i++;
                        }
                    }
                }
            }
        }
        contigs.sort(new ContigDictComparator(dict).createLocatableComparator());
        final Allele ref_allele = Allele.create("N", true);
        final Allele dup_allele = Allele.create("<DUP>", false);
        final Allele del_allele = Allele.create("<DEL>", false);
        final Function<Integer, List<Allele>> cnv2allele = CNV -> {
            switch(CNV) {
                case 0:
                    return Arrays.asList(ref_allele, ref_allele);
                case 1:
                    return Arrays.asList(ref_allele, dup_allele);
                case 2:
                    return Arrays.asList(dup_allele, dup_allele);
                case -1:
                    return Arrays.asList(ref_allele, del_allele);
                case -2:
                    return Arrays.asList(del_allele, del_allele);
                default:
                    throw new IllegalArgumentException("cnv:" + CNV);
            }
        };
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
        final VCFFormatHeaderLine fmtLeftCov = new VCFFormatHeaderLine("LC", 1, VCFHeaderLineType.Float, "Left median coverage.");
        final VCFFormatHeaderLine fmtMidCov = new VCFFormatHeaderLine("MC", 1, VCFHeaderLineType.Float, "Middle median coverage.");
        final VCFFormatHeaderLine fmtRightCov = new VCFFormatHeaderLine("RC", 1, VCFHeaderLineType.Float, "Right median coverage.");
        final VCFFormatHeaderLine fmtLeftMedian = new VCFFormatHeaderLine("LM", 1, VCFHeaderLineType.Float, "Left normalized median coverage.");
        final VCFFormatHeaderLine fmtMidMedian = new VCFFormatHeaderLine("MM", 1, VCFHeaderLineType.Float, "Middle normalized median coverage.");
        final VCFFormatHeaderLine fmtRightMedian = new VCFFormatHeaderLine("RM", 1, VCFHeaderLineType.Float, "Right normalized median coverage.");
        metaData.add(fmtLeftCov);
        metaData.add(fmtMidCov);
        metaData.add(fmtRightCov);
        metaData.add(fmtLeftMedian);
        metaData.add(fmtMidMedian);
        metaData.add(fmtRightMedian);
        final VCFHeader header = new VCFHeader(metaData, sampleNames);
        header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        final VariantContextWriter vcw = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
        vcw.writeHeader(header);
        for (final Locatable contig : contigs) {
            System.gc();
            final short[] array = new short[contig.getLengthOnReference()];
            final SortingCollection<Gt> sorter = SortingCollection.newInstance(Gt.class, new GtCodec(), (A, B) -> A.compare1(B), this.sorting.getMaxRecordsInRam(), this.sorting.getTmpPaths());
            for (int bam_index = 0; bam_index < samples.size(); bam_index++) {
                final Sample sampleBam = samples.get(bam_index);
                Arrays.fill(array, (short) 0);
                try (SAMRecordIterator iter = sampleBam.samReader.queryOverlapping(contig.getContig(), contig.getStart(), contig.getEnd())) {
                    while (iter.hasNext()) {
                        final SAMRecord rec = iter.next();
                        if (rec.getReadUnmappedFlag())
                            continue;
                        if (rec.getDuplicateReadFlag())
                            continue;
                        if (rec.isSecondaryOrSupplementary())
                            continue;
                        if (rec.getReadFailsVendorQualityCheckFlag())
                            continue;
                        final Cigar cigar = rec.getCigar();
                        if (cigar == null || cigar.isEmpty())
                            continue;
                        int refPos = rec.getStart();
                        for (final CigarElement ce : cigar) {
                            final CigarOperator op = ce.getOperator();
                            if (op.consumesReferenceBases()) {
                                if (op.consumesReadBases()) {
                                    for (int i = 0; i < ce.getLength(); i++) {
                                        final int idx = refPos - contig.getStart() + i;
                                        if (idx < 0)
                                            continue;
                                        if (idx >= array.length)
                                            break;
                                        if (array[idx] == Short.MAX_VALUE)
                                            continue;
                                        array[idx]++;
                                    }
                                }
                                refPos += ce.getLength();
                            }
                        }
                    }
                }
                for (int widx = 0; widx < windows_array.length; widx += 2) {
                    final int window_size = windows_array[widx + 0];
                    final int extend = (int) Math.ceil(window_size * this.extend);
                    if (extend <= 0)
                        continue;
                    if (window_size > contig.getLengthOnReference())
                        continue;
                    final int window_shift = windows_array[widx + 1];
                    LOG.info(contig + " " + window_size + "+-" + extend + ";" + window_shift + " " + sampleBam.name);
                    final Coverage leftcov = new Coverage(extend);
                    final Coverage rightcov = new Coverage(extend);
                    final Coverage leftrightcov = new Coverage(extend + extend);
                    final Coverage midcov = new Coverage(window_size);
                    for (int pos1 = contig.getStart(); pos1 + window_size + extend + extend < contig.getEnd(); pos1 += window_shift) {
                        leftcov.reset();
                        rightcov.reset();
                        leftrightcov.reset();
                        midcov.reset();
                        for (int x = 0; x < extend; x++) {
                            final int idx = pos1 - contig.getStart() + x;
                            leftcov.add(array[idx]);
                            leftrightcov.add(array[idx]);
                        }
                        final double leftMedian = leftcov.median();
                        if (leftMedian < this.min_depth)
                            continue;
                        for (int x = 0; x < extend; x++) {
                            final int idx = pos1 - contig.getStart() + extend + window_size + x;
                            rightcov.add(array[idx]);
                            leftrightcov.add(array[idx]);
                        }
                        final double rightMedian = rightcov.median();
                        if (rightMedian < this.min_depth)
                            continue;
                        for (int x = 0; x < window_size; x++) {
                            final int idx = pos1 - contig.getStart() + extend + x;
                            midcov.add(array[idx]);
                        }
                        final double median = leftrightcov.median();
                        if (rightcov.median() < this.min_depth)
                            continue;
                        for (int x = 0; x < midcov.count; x++) {
                            midcov.array[x] /= median;
                        }
                        for (int x = 0; x < leftcov.count; x++) {
                            leftcov.array[x] /= median;
                        }
                        for (int x = 0; x < rightcov.count; x++) {
                            rightcov.array[x] /= median;
                        }
                        final double norm_depth = midcov.median();
                        final int cnv = getCNVIndex(norm_depth);
                        if (cnv != CNV_UNDEFINED && cnv != 1 && getCNVIndex(leftcov.median()) == 1 && getCNVIndex(rightcov.median()) == 1) {
                            final Gt gt = new Gt();
                            gt.start = pos1 + extend;
                            gt.end = gt.start + window_size;
                            gt.sample_idx = bam_index;
                            gt.cnv = cnv;
                            gt.leftMedian = (float) leftMedian;
                            gt.midMedian = (float) median;
                            gt.rightMedian = (float) rightMedian;
                            gt.leftCov = (float) leftcov.median();
                            gt.midCov = (float) norm_depth;
                            gt.rightCov = (float) rightcov.median();
                            sorter.add(gt);
                        }
                    }
                }
            }
            sorter.setDestructiveIteration(true);
            try (CloseableIterator<Gt> iter = sorter.iterator()) {
                final EqualRangeIterator<Gt> eq = new EqualRangeIterator<>(iter, (A, B) -> A.compare0(B));
                while (eq.hasNext()) {
                    final List<Gt> row = eq.next();
                    if (row.isEmpty())
                        continue;
                    final Gt first = row.get(0);
                    final Set<Allele> alleles = new HashSet<>();
                    row.stream().flatMap(GT -> cnv2allele.apply(GT.cnv).stream()).forEach(CNV -> alleles.add(CNV));
                    alleles.add(ref_allele);
                    if (alleles.size() < 2)
                        continue;
                    final VariantContextBuilder vcb = new VariantContextBuilder(null, contig.getContig(), first.start, first.end, alleles);
                    vcb.attribute(VCFConstants.END_KEY, first.end);
                    final List<Genotype> genotypes = new ArrayList<>(samples.size());
                    for (final Gt gt : row) {
                        final GenotypeBuilder gb = new GenotypeBuilder(samples.get(gt.sample_idx).name, cnv2allele.apply(gt.cnv));
                        gb.attribute(fmtLeftMedian.getID(), (double) gt.leftMedian);
                        gb.attribute(fmtMidMedian.getID(), (double) gt.midMedian);
                        gb.attribute(fmtRightMedian.getID(), (double) gt.rightMedian);
                        gb.attribute(fmtLeftCov.getID(), (double) gt.leftCov);
                        gb.attribute(fmtMidCov.getID(), (double) gt.midCov);
                        gb.attribute(fmtRightCov.getID(), (double) gt.rightCov);
                        genotypes.add(gb.make());
                    }
                    vcb.genotypes(genotypes);
                    vcw.add(vcb.make());
                }
                eq.close();
            }
            sorter.cleanup();
        }
        vcw.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        samples.forEach(S -> CloserUtil.close(S));
    }
}
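
Below is a minimal, standalone sketch (not part of jvarkit) of the sorting idiom used above: ContigDictComparator.createLocatableComparator() orders Locatable records by the contig order of a SAMSequenceDictionary (and, presumably, by position within a contig). The class name, contig names and lengths are made up for illustration; CnvSlidingWindow builds the real dictionary from the reference via SequenceDictionaryUtils.extractRequired.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.Locatable;

import com.github.lindenb.jvarkit.samtools.util.SimpleInterval;
import com.github.lindenb.jvarkit.util.samtools.ContigDictComparator;

public class SortByDictDemo {
    public static void main(final String[] args) {
        // dictionary order is chr1, chr2, chr10 (not lexicographic order)
        final SAMSequenceDictionary dict = new SAMSequenceDictionary(Arrays.asList(
                new SAMSequenceRecord("chr1", 1_000_000),
                new SAMSequenceRecord("chr2", 1_000_000),
                new SAMSequenceRecord("chr10", 1_000_000)));

        final List<Locatable> contigs = new ArrayList<>(Arrays.asList(
                new SimpleInterval("chr10", 1, 100),
                new SimpleInterval("chr1", 500, 600),
                new SimpleInterval("chr1", 1, 100)));

        // same call as in CnvSlidingWindow.doWork() above
        contigs.sort(new ContigDictComparator(dict).createLocatableComparator());

        // expected output: the chr1 intervals first, then chr10 (dictionary order, not alphabetical)
        contigs.forEach(L -> System.out.println(L.getContig() + ":" + L.getStart() + "-" + L.getEnd()));
    }
}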

Example 17 with ContigDictComparator

Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in the project jvarkit by lindenb.

The doWork method of the class GtfRetroCopy.

@Override
public int doWork(final List<String> args) {
    VCFReader vcfFileReader = null;
    VariantContextWriter vcw0 = null;
    try {
        /* load the reference genome */
        /* create a contig name converter from the REF */
        final Set<String> knownGeneIds;
        if (this.knownPath != null) {
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.knownPath)) {
                knownGeneIds = br.lines().filter(L -> !StringUtils.isBlank(L)).map(S -> S.trim()).filter(S -> !(S.equals("-") || S.equals(".") || S.startsWith("#"))).collect(Collectors.toSet());
            }
        } else {
            knownGeneIds = Collections.emptySet();
        }
        // open the sam file
        final String input = oneAndOnlyOneFile(args);
        vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(input), true);
        final VCFHeader header = vcfFileReader.getHeader();
        final SAMSequenceDictionary dict = header.getSequenceDictionary();
        final Comparator<String> contigCmp = dict == null ? (A, B) -> A.compareTo(B) : new ContigDictComparator(dict);
        final Comparator<Gene> geneCmp = (A, B) -> {
            int i = contigCmp.compare(A.getContig(), B.getContig());
            if (i != 0)
                return i;
            i = Integer.compare(A.getStart(), B.getStart());
            if (i != 0)
                return i;
            return Integer.compare(A.getEnd(), B.getEnd());
        };
        final GtfReader gtfReader = new GtfReader(this.gtfPath);
        if (dict != null && !dict.isEmpty()) {
            this.writingVcf.dictionary(dict);
            gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
        }
        final List<Gene> genes = gtfReader.getAllGenes().stream().filter(G -> G.getTranscripts().stream().count() > 0L).filter(G -> G.getTranscripts().stream().anyMatch(T -> T.getIntronCount() >= this.min_intron_count)).sorted(geneCmp).collect(Collectors.toList());
        gtfReader.close();
        /**
         * build vcf header
         */
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
        metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
        metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_BOUNDS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Introns boundaries"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_SIZES, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Introns sizes"));
        metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
        for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
            metaData.add(new VCFInfoHeaderLine(att, 1, VCFHeaderLineType.String, "Value for the attribute '" + att + "' in the gtf"));
        }
        // metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
        // metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
        // "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
        metaData.add(new VCFFilterHeaderLine(ATT_KNOWN, "Known RetroGenes. " + (this.knownPath == null ? "" : " Source: " + this.knownPath)));
        final VCFHeader header2 = new VCFHeader(header);
        metaData.stream().forEach(H -> header2.addMetaDataLine(H));
        JVarkitVersion.getInstance().addMetaData(this, header2);
        final Allele ref = Allele.create((byte) 'N', true);
        final Allele alt = Allele.create("<RETROCOPY>", false);
        /* open vcf for writing*/
        vcw0 = this.writingVcf.open(this.outputFile);
        vcw0.writeHeader(header2);
        final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(dict).build();
        for (final Gene gene : genes) {
            progress.apply(gene);
            final List<VariantContext> variants = new ArrayList<>();
            final CloseableIterator<VariantContext> iter2 = vcfFileReader.query(gene.getContig(), gene.getStart(), gene.getEnd());
            while (iter2.hasNext()) {
                final VariantContext ctx = iter2.next();
                // SNV
                if (ctx.getStart() == ctx.getEnd())
                    continue;
                StructuralVariantType svType = ctx.getStructuralVariantType();
                if (StructuralVariantType.BND.equals(svType))
                    continue;
                if (StructuralVariantType.INS.equals(svType))
                    continue;
                variants.add(ctx);
            }
            iter2.close();
            if (variants.isEmpty())
                continue;
            for (final Transcript transcript : gene.getTranscripts()) {
                if (!transcript.hasIntron())
                    continue;
                if (transcript.getIntronCount() < this.min_intron_count)
                    continue;
                final Counter<String> samples = new Counter<>();
                for (final Intron intron : transcript.getIntrons()) {
                    for (final VariantContext ctx : variants) {
                        if (!isWithinDistance(intron.getStart(), ctx.getStart()))
                            continue;
                        if (!isWithinDistance(intron.getEnd(), ctx.getEnd()))
                            continue;
                        if (ctx.hasGenotypes()) {
                            for (final Genotype g : ctx.getGenotypes()) {
                                if (g.isNoCall() || g.isHomRef())
                                    continue;
                                samples.incr(g.getSampleName());
                            }
                        } else {
                            samples.incr("*");
                        }
                    }
                // end iter2
                }
                // end intron
                final long max_count = samples.stream().mapToLong(E -> E.getValue()).max().orElse(0L);
                if (max_count == 0)
                    continue;
                if (this.only_all_introns && max_count != transcript.getIntronCount())
                    continue;
                // ok good candidate
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(transcript.getContig());
                vcb.start(transcript.getStart());
                vcb.stop(transcript.getEnd());
                switch(this.idKey) {
                    case gene_name:
                        final String gn = transcript.getGene().getGeneName();
                        vcb.id(StringUtils.isBlank(gn) ? transcript.getId() : gn);
                        break;
                    case gene_id:
                        vcb.id(transcript.getGene().getId());
                        break;
                    case transcript_id:
                        vcb.id(transcript.getId());
                        break;
                    default:
                        throw new IllegalStateException();
                }
                final List<Allele> alleles = Arrays.asList(ref, alt);
                // vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY,2);
                // vcb.attribute(VCFConstants.ALLELE_COUNT_KEY,1);
                // vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY,0.5);
                vcb.attribute(VCFConstants.SVTYPE, "DEL");
                vcb.attribute(VCFConstants.END_KEY, transcript.getEnd());
                vcb.attribute("SVLEN", transcript.getLengthOnReference());
                vcb.attribute(ATT_INTRONS_BOUNDS, transcript.getIntrons().stream().map(S -> "" + S.getStart() + "-" + S.getEnd()).collect(Collectors.toList()));
                vcb.attribute(ATT_INTRONS_SIZES, transcript.getIntrons().stream().mapToInt(S -> S.getLengthOnReference()).toArray());
                for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
                    final String v = transcript.getProperties().get(att);
                    if (StringUtils.isBlank(v))
                        continue;
                    vcb.attribute(att, v);
                }
                vcb.alleles(alleles);
                boolean pass_filter = true;
                // introns sequences
                vcb.attribute(ATT_INTRONS_CANDIDATE_COUNT, max_count);
                vcb.attribute(ATT_INTRONS_COUNT, transcript.getIntronCount());
                vcb.attribute(ATT_INTRONS_CANDIDATE_FRACTION, max_count / (float) transcript.getIntronCount());
                if (transcript.getIntronCount() != max_count) {
                    vcb.filter(ATT_NOT_ALL_INTRONS);
                    pass_filter = false;
                }
                if (knownGeneIds.contains(transcript.getGene().getId())) {
                    vcb.filter(ATT_KNOWN);
                    pass_filter = false;
                }
                if (header.hasGenotypingData()) {
                    final List<Genotype> genotypes = new ArrayList<>();
                    for (final String sn : header.getSampleNamesInOrder()) {
                        final List<Allele> gtalleles;
                        if (samples.count(sn) == 0L) {
                            gtalleles = Arrays.asList(ref, ref);
                        } else {
                            gtalleles = Arrays.asList(ref, alt);
                        }
                        final GenotypeBuilder gb = new GenotypeBuilder(sn, gtalleles);
                        genotypes.add(gb.make());
                    }
                    vcb.genotypes(genotypes);
                }
                if (pass_filter)
                    vcb.passFilters();
                vcw0.add(vcb.make());
            }
        }
        progress.close();
        vcw0.close();
        vcfFileReader.close();
        vcfFileReader = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(vcfFileReader);
        CloserUtil.close(vcw0);
    }
}
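
The comparator composition above can be reduced to a small standalone sketch: ContigDictComparator is itself a Comparator<String> over contig names (GtfRetroCopy falls back to plain String comparison when the VCF header carries no dictionary), so it can be combined with start/end comparisons through Comparator.comparing(). The class name, contigs and intervals below are invented for the example.

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.Locatable;

import com.github.lindenb.jvarkit.samtools.util.SimpleInterval;
import com.github.lindenb.jvarkit.util.samtools.ContigDictComparator;

public class GeneOrderDemo {
    public static void main(final String[] args) {
        final SAMSequenceDictionary dict = new SAMSequenceDictionary(Arrays.asList(
                new SAMSequenceRecord("1", 249_250_621),
                new SAMSequenceRecord("2", 243_199_373)));

        // contig order from the dictionary, then start, then end: the same ordering as geneCmp above
        final Comparator<String> contigCmp = new ContigDictComparator(dict);
        final Comparator<Locatable> locCmp = Comparator
                .comparing(Locatable::getContig, contigCmp)
                .thenComparingInt(Locatable::getStart)
                .thenComparingInt(Locatable::getEnd);

        final List<Locatable> genes = Arrays.asList(
                new SimpleInterval("2", 100, 200),
                new SimpleInterval("1", 300, 400),
                new SimpleInterval("1", 100, 400));

        genes.stream().sorted(locCmp).forEach(G -> System.out.println(G.getContig() + ":" + G.getStart() + "-" + G.getEnd()));
    }
}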

Example 18 with ContigDictComparator

Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in the project jvarkit by lindenb.

The doWork method of the class ReferenceToHtml.

@Override
public int doWork(final List<String> args) {
    try {
        try (VCFReader reader = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)), true)) {
            final VCFHeader header = reader.getHeader();
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
            final Optional<String> buildName = SequenceDictionaryUtils.getBuildName(dict);
            try (ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx)) {
                SequenceUtil.assertSequenceDictionariesEqual(SequenceDictionaryUtils.extractRequired(reference), dict);
                final List<Locatable> locs = IntervalListProvider.from(this.regionsStr).dictionary(dict).stream().sorted(new ContigDictComparator(dict).createLocatableComparator()).collect(Collectors.toList());
                if (locs.isEmpty()) {
                    LOG.error("no region was defined");
                    return -1;
                }
                final XMLOutputFactory xof = XMLOutputFactory.newFactory();
                try (ArchiveFactory archive = ArchiveFactory.open(archiveOutput)) {
                    try (PrintWriter pw = archive.openWriter(this.prefix + "script.js")) {
                        pw.println(StaticCodeExtractor.forClass(this.getClass()).extract("SCRIPT").get());
                        pw.flush();
                    }
                    try (PrintWriter pw = archive.openWriter(this.prefix + "style.css")) {
                        pw.println(StaticCodeExtractor.forClass(this.getClass()).extract("CSS").get());
                        pw.flush();
                    }
                    final OutputStream index_os = archive.openOuputStream(this.prefix + "index.html");
                    final XMLStreamWriter index_html = xof.createXMLStreamWriter(index_os, "UTF-8");
                    index_html.writeStartDocument("UTF-8", "1.0");
                    index_html.writeStartElement("html");
                    index_html.writeStartElement("head");
                    writeMeta(index_html);
                    index_html.writeStartElement("title");
                    index_html.writeCharacters(this.getProgramName());
                    // title
                    index_html.writeEndElement();
                    index_html.writeStartElement("link");
                    index_html.writeAttribute("rel", "stylesheet");
                    index_html.writeAttribute("href", this.prefix + "style.css");
                    index_html.writeCharacters("");
                    // link
                    index_html.writeEndElement();
                    // head
                    index_html.writeEndElement();
                    index_html.writeStartElement("body");
                    index_html.writeStartElement("ul");
                    for (final Locatable loc : locs) {
                        final List<VariantContext> variants;
                        try (CloseableIterator<VariantContext> iter = reader.query(loc)) {
                            variants = iter.stream().collect(Collectors.toList());
                        }
                        StringWriter sw = new StringWriter();
                        sw.append("var g = ");
                        JsonWriter jsw = new JsonWriter(sw);
                        jsw.beginObject();
                        jsw.name("contig").value(loc.getContig());
                        jsw.name("start").value(loc.getStart());
                        jsw.name("end").value(loc.getEnd());
                        jsw.name("length").value(loc.getLengthOnReference());
                        jsw.name("sequence").value(reference.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getEnd()).getBaseString());
                        jsw.name("variants");
                        jsw.beginArray();
                        for (VariantContext ctx : variants) {
                            jsw.beginObject();
                            jsw.name("start").value(ctx.getStart());
                            jsw.name("end").value(ctx.getEnd());
                            jsw.name("id").value(ctx.hasID() ? ctx.getID() : "");
                            jsw.name("type").value(ctx.getType().name());
                            jsw.name("ref").value(ctx.getReference().getDisplayString());
                            jsw.name("alts");
                            jsw.beginArray();
                            for (Allele alt : ctx.getAlternateAlleles()) {
                                jsw.value(alt.getDisplayString());
                            }
                            jsw.endArray();
                            jsw.endObject();
                        }
                        jsw.endArray();
                        jsw.endObject();
                        jsw.flush();
                        jsw.close();
                        sw.append(";");
                        final String filename = this.prefix + loc.getContig() + "_" + loc.getStart() + "_" + loc.getEnd() + ".html";
                        final String title = (buildName.isPresent() ? buildName.get() + " " : "") + new SimpleInterval(loc).toNiceString();
                        OutputStream os = archive.openOuputStream(filename);
                        XMLStreamWriter out = xof.createXMLStreamWriter(os, "UTF-8");
                        out.writeStartDocument("UTF-8", "1.0");
                        out.writeStartElement("html");
                        out.writeStartElement("head");
                        writeMeta(out);
                        out.writeStartElement("script");
                        out.writeCharacters(sw.toString());
                        // script
                        out.writeEndElement();
                        out.writeStartElement("script");
                        out.writeAttribute("src", this.prefix + "script.js");
                        out.writeCharacters("");
                        // script
                        out.writeEndElement();
                        out.writeStartElement("link");
                        out.writeAttribute("rel", "stylesheet");
                        out.writeAttribute("href", this.prefix + "style.css");
                        out.writeCharacters("");
                        // link
                        out.writeEndElement();
                        out.writeStartElement("title");
                        out.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
                        // title
                        out.writeEndElement();
                        // head
                        out.writeEndElement();
                        out.writeStartElement("body");
                        out.writeStartElement("h1");
                        out.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
                        out.writeEndElement();
                        out.writeStartElement("div");
                        checkBox(out, "showPos", "Line Number");
                        checkBox(out, "showSpace", "Spaces");
                        checkBox(out, "showVar", "Show Variants");
                        out.writeCharacters(" ");
                        out.writeEmptyElement("input");
                        out.writeAttribute("id", "primer");
                        out.writeAttribute("type", "text");
                        out.writeAttribute("placeholder", "Primer Sequence");
                        out.writeStartElement("button");
                        out.writeAttribute("id", "search");
                        out.writeCharacters("Search...");
                        out.writeEndElement();
                        // div
                        out.writeEndElement();
                        out.writeStartElement("div");
                        out.writeAttribute("class", "sequence");
                        out.writeAttribute("id", "main");
                        // div
                        out.writeEndElement();
                        // body
                        out.writeEndElement();
                        // html
                        out.writeEndElement();
                        os.flush();
                        os.close();
                        index_html.writeStartElement("li");
                        index_html.writeStartElement("a");
                        index_html.writeAttribute("href", filename);
                        index_html.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
                        // a
                        index_html.writeEndElement();
                        // li
                        index_html.writeEndElement();
                    }
                    // ul
                    index_html.writeEndElement();
                    // body
                    index_html.writeEndElement();
                    // html
                    index_html.writeEndElement();
                    index_html.writeEndDocument();
                    index_html.close();
                    index_os.flush();
                    index_os.close();
                }
            }
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    }
}
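
One detail of the page generation above that is easy to miss is how the per-region data reaches the JavaScript side: each region is serialized with gson's JsonWriter into a StringWriter, prefixed with "var g = " and suffixed with ";", and the resulting string is written into a <script> element. A stripped-down sketch of that step follows; the field values are placeholders and the surrounding XMLStreamWriter plumbing is omitted.

import java.io.IOException;
import java.io.StringWriter;

import com.google.gson.stream.JsonWriter;

public class RegionJsonDemo {
    public static void main(final String[] args) throws IOException {
        final StringWriter sw = new StringWriter();
        sw.append("var g = ");
        try (JsonWriter jsw = new JsonWriter(sw)) {
            jsw.beginObject();
            jsw.name("contig").value("chr2");
            jsw.name("start").value(5000);
            jsw.name("end").value(6000);
            jsw.name("variants");
            jsw.beginArray();
            // one variant entry, mirroring a few of the fields written by the tool
            jsw.beginObject();
            jsw.name("start").value(5123);
            jsw.name("ref").value("A");
            jsw.name("alts");
            jsw.beginArray();
            jsw.value("T");
            jsw.endArray();
            jsw.endObject();
            jsw.endArray();
            jsw.endObject();
            jsw.flush();
        }
        sw.append(";");
        // the resulting string can be embedded as-is in a <script> element of the generated page
        System.out.println(sw.toString());
    }
}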

Example 19 with ContigDictComparator

Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in the project jvarkit by lindenb.

The doWork method of the class CopyNumber01.

@Override
public int doWork(final List<String> args) {
    ReferenceSequenceFile indexedFastaSequenceFile = null;
    try {
        this.sexContigSet.addAll(Arrays.stream(this.sexContigStr.split("[, \t]")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
        /* loading REF Reference */
        indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
        final Comparator<Locatable> locComparator = new ContigDictComparator(dict).createLocatableComparator();
        final List<Locatable> intervals = new ArrayList<>();
        if (this.bedFile == null) {
            for (final Locatable loc : dict.getSequences()) {
                intervals.add(loc);
            }
        } else {
            final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
            try (BedLineReader br = new BedLineReader(this.bedFile)) {
                br.stream().filter(L -> !StringUtil.isBlank(converter.apply(L.getContig()))).forEach(B -> {
                    final String ctg = converter.apply(B.getContig());
                    intervals.add(new SimpleInterval(ctg, B.getStart(), B.getEnd()));
                });
            }
        }
        if (intervals.isEmpty()) {
            LOG.error("no interval defined.");
            return -1;
        }
        LOG.info("intervals N=" + intervals.size() + " mean-size:" + intervals.stream().mapToInt(R -> R.getLengthOnReference()).average().orElse(0.0));
        final List<GCAndDepth> user_items = new ArrayList<>();
        // split intervals
        for (final Locatable loc : intervals) {
            int pos = loc.getStart();
            while (pos < loc.getEnd()) {
                final int pos_end = Math.min(pos + this.windowSize, loc.getEnd());
                final GCAndDepth dataRow = new GCAndDepth(loc.getContig(), pos, pos_end);
                if (dataRow.getLengthOnReference() < this.windowMin) {
                    break;
                }
                user_items.add(dataRow);
                pos += this.windowShift;
            }
        }
        // free memory
        intervals.clear();
        LOG.info("sorting N=" + user_items.size());
        Collections.sort(user_items, locComparator);
        // fill gc percent
        LOG.info("fill gc% N=" + user_items.size());
        for (final String ctg : user_items.stream().map(T -> T.getContig()).collect(Collectors.toSet())) {
            final GenomicSequence gseq = new GenomicSequence(indexedFastaSequenceFile, ctg);
            for (final GCAndDepth dataRow : user_items) {
                if (!dataRow.getContig().equals(ctg))
                    continue;
                final GCPercent gc = gseq.getGCPercent(dataRow.getStart(), dataRow.getEnd());
                if (gc.isEmpty())
                    continue;
                dataRow.gc = gc.getGCPercent();
            }
        }
        // remove strange gc
        user_items.removeIf(B -> B.gc < this.minGC);
        user_items.removeIf(B -> B.gc > this.maxGC);
        LOG.info("remove high/low gc% N=" + user_items.size());
        if (user_items.stream().allMatch(P -> isSex(P.getContig()))) {
            LOG.error("All chromosomes are defined as sexual. Cannot normalize");
            return -1;
        }
        final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mappingQuality);
        try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
            /* header */
            pw.println("#CHROM\tSTART\tEND\tSample\tIDX\tGC\tRAW-DEPTH\tNORM-DEPTH");
            for (final Path bamPath : IOUtils.unrollPaths(args)) {
                // open this samReader
                try (SamReader samReader = super.createSamReaderFactory().referenceSequence(this.refFile).open(bamPath)) {
                    if (!samReader.hasIndex()) {
                        LOG.error("file is not indexed " + bamPath);
                        return -1;
                    }
                    final SAMFileHeader header = samReader.getFileHeader();
                    SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
                    final String sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toSet()).stream().collect(HtsCollectors.toSingleton());
                    final List<GCAndDepth> bam_items = new ArrayList<>(user_items.size());
                    /* loop over contigs */
                    for (final SAMSequenceRecord ssr : dict.getSequences()) {
                        /* create a **COPY** of the intervals */
                        final List<GCAndDepth> ctgitems = user_items.stream().filter(T -> T.contigsMatch(ssr)).collect(Collectors.toList());
                        if (ctgitems.isEmpty())
                            continue;
                        LOG.info("Getting coverage for " + ssr.getSequenceName() + " N=" + ctgitems.size());
                        // get coverage
                        final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(samReader, ctgitems, sampleName);
                        // fill coverage
                        for (final GCAndDepth gc : ctgitems) {
                            final OptionalDouble optCov;
                            switch(this.univariateDepth) {
                                case median:
                                    optCov = coverage.getMedian(gc);
                                    break;
                                case mean:
                                    optCov = coverage.getAverage(gc);
                                    break;
                                default:
                                    throw new IllegalStateException();
                            }
                            gc.raw_depth = optCov.orElse(-1.0);
                            gc.norm_depth = gc.raw_depth;
                        }
                        ctgitems.removeIf(V -> V.raw_depth < 0);
                        ctgitems.removeIf(V -> V.raw_depth > this.weirdMaxDepth);
                        ctgitems.removeIf(V -> V.raw_depth < this.weirdMinDepth);
                        if (ctgitems.isEmpty())
                            continue;
                        bam_items.addAll(ctgitems);
                    }
                    double[] y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.raw_depth).toArray();
                    LOG.info("median raw depth " + new Median().evaluate(y, 0, y.length));
                    Collections.sort(bam_items, (a, b) -> {
                        final int i = Double.compare(a.getX(), b.getX());
                        if (i != 0)
                            return i;
                        return Double.compare(a.getY(), b.getY());
                    });
                    double[] x = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getX()).toArray();
                    y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getY()).toArray();
                    // get min GC
                    final double min_x = x[0];
                    // get max GC
                    final double max_x = x[x.length - 1];
                    LOG.info("min/max gc " + min_x + " " + max_x);
                    /* merge adjacent x (GC%) having same values */
                    int i = 0;
                    int k = 0;
                    while (i < x.length) {
                        int j = i + 1;
                        while (j < x.length && Precision.equals(x[i], x[j])) {
                            ++j;
                        }
                        x[k] = x[i];
                        y[k] = this.univariateGCLoess.create().evaluate(y, i, j - i);
                        ++k;
                        i = j;
                    }
                    LOG.info("merged n=" + (x.length - k) + " items.");
                    /* reduce size of x et y */
                    final List<XY> xyL = new ArrayList<>(k);
                    for (int t = 0; t < k; ++t) {
                        xyL.add(new XYImpl(x[t], y[t]));
                    }
                    /* sort on Y */
                    Collections.sort(xyL, (a, b) -> {
                        final int d = Double.compare(a.getX(), b.getX());
                        if (d != 0)
                            return d;
                        return Double.compare(a.getY(), b.getY());
                    });
                    x = xyL.stream().mapToDouble(R -> R.getX()).toArray();
                    y = xyL.stream().mapToDouble(R -> R.getY()).toArray();
                    final UnivariateInterpolator interpolator = createInterpolator();
                    UnivariateFunction spline = null;
                    try {
                        spline = interpolator.interpolate(x, y);
                    } catch (final org.apache.commons.math3.exception.NumberIsTooSmallException err) {
                        spline = null;
                        LOG.error("Cannot use " + interpolator.getClass().getName() + ":" + err.getMessage());
                    }
                    // min depth cal
                    int points_removed = 0;
                    i = 0;
                    while (i < bam_items.size()) {
                        final GCAndDepth r = bam_items.get(i);
                        if (spline == null) {
                            ++i;
                        } else if (r.getX() < min_x || r.getX() > max_x) {
                            bam_items.remove(i);
                            ++points_removed;
                        } else {
                            final double norm;
                            if (this.gcDepthInterpolation.equals(UnivariateInerpolation.identity)) {
                                norm = r.getY();
                            } else {
                                norm = spline.value(r.getX());
                            }
                            if (Double.isNaN(norm) || Double.isInfinite(norm)) {
                                LOG.info("NAN " + r);
                                bam_items.remove(i);
                                ++points_removed;
                                continue;
                            }
                            r.norm_depth = norm;
                            ++i;
                        }
                    }
                    LOG.info("removed " + points_removed + ". now N=" + bam_items.size());
                    if (bam_items.isEmpty())
                        continue;
                    spline = null;
                    // DO NOT NORMALIZE ON MINIMUM DEPTH, STUPID.
                    // normalize on median
                    y = bam_items.stream().mapToDouble(G -> G.getY()).toArray();
                    final double median_depth = this.univariateMid.create().evaluate(y, 0, y.length);
                    LOG.info("median norm depth : " + median_depth);
                    for (i = 0; median_depth > 0 && i < bam_items.size(); ++i) {
                        final GCAndDepth gc = bam_items.get(i);
                        gc.norm_depth /= median_depth;
                    }
                    // restore genomic order
                    Collections.sort(bam_items, locComparator);
                    // smoothing values with neighbours
                    y = bam_items.stream().mapToDouble(V -> V.getY()).toArray();
                    for (i = 0; i < bam_items.size(); ++i) {
                        final GCAndDepth gc = bam_items.get(i);
                        int left = i;
                        for (int j = Math.max(0, i - this.smooth_window); j <= i; ++j) {
                            final GCAndDepth gc2 = bam_items.get(j);
                            if (!gc2.withinDistanceOf(gc, this.smoothDistance))
                                continue;
                            left = j;
                            break;
                        }
                        int right = i;
                        for (int j = i; j <= i + this.smooth_window && j < bam_items.size(); ++j) {
                            final GCAndDepth gc2 = bam_items.get(j);
                            if (!gc2.withinDistanceOf(gc, this.smoothDistance))
                                break;
                            right = j;
                            // keep extending to the right: no break here
                        }
                        gc.norm_depth = this.univariateSmooth.create().evaluate(y, left, (right - left) + 1);
                    }
                    /* print data */
                    for (final GCAndDepth r : bam_items) {
                        pw.print(r.getContig());
                        pw.print('\t');
                        pw.print(r.getStart() - 1);
                        pw.print('\t');
                        pw.print(r.getEnd());
                        pw.print('\t');
                        pw.print(sampleName);
                        pw.print('\t');
                        pw.print(getGenomicIndex(dict, r.getContig(), r.getStart()) - 1);
                        pw.print('\t');
                        pw.printf("%.3f", r.gc);
                        pw.print('\t');
                        pw.printf("%.3f", r.raw_depth);
                        pw.print('\t');
                        pw.printf("%.3f", r.norm_depth);
                        pw.println();
                    }
                    pw.flush();
                }
            // samReader
            }
            // end loop bamPath
            pw.flush();
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(indexedFastaSequenceFile);
    }
}
Also used : Arrays(java.util.Arrays) Precision(org.apache.commons.math3.util.Precision) NevilleInterpolator(org.apache.commons.math3.analysis.interpolation.NevilleInterpolator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) UnivariateInterpolator(org.apache.commons.math3.analysis.interpolation.UnivariateInterpolator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) CoverageFactory(com.github.lindenb.jvarkit.samtools.CoverageFactory) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) Median(org.apache.commons.math3.stat.descriptive.rank.Median) Path(java.nio.file.Path) MathIllegalArgumentException(org.apache.commons.math3.exception.MathIllegalArgumentException) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) LinearInterpolator(org.apache.commons.math3.analysis.interpolation.LinearInterpolator) LoessInterpolator(org.apache.commons.math3.analysis.interpolation.LoessInterpolator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) UnivariateFunction(org.apache.commons.math3.analysis.UnivariateFunction) SplineInterpolator(org.apache.commons.math3.analysis.interpolation.SplineInterpolator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) OptionalDouble(java.util.OptionalDouble) HtsCollectors(com.github.lindenb.jvarkit.stream.HtsCollectors) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) AbstractUnivariateStatistic(org.apache.commons.math3.stat.descriptive.AbstractUnivariateStatistic) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Locatable(htsjdk.samtools.util.Locatable) GCPercent(com.github.lindenb.jvarkit.util.picard.GenomicSequence.GCPercent) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) DividedDifferenceInterpolator(org.apache.commons.math3.analysis.interpolation.DividedDifferenceInterpolator) Identity(org.apache.commons.math3.analysis.function.Identity) DimensionMismatchException(org.apache.commons.math3.exception.DimensionMismatchException) SamReader(htsjdk.samtools.SamReader) DynamicParameter(com.beust.jcommander.DynamicParameter) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) Collections(java.util.Collections) ArrayList(java.util.ArrayList) Median(org.apache.commons.math3.stat.descriptive.rank.Median) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) 
SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) SamReader(htsjdk.samtools.SamReader) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) UnivariateInterpolator(org.apache.commons.math3.analysis.interpolation.UnivariateInterpolator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) PrintWriter(java.io.PrintWriter) Path(java.nio.file.Path) GCPercent(com.github.lindenb.jvarkit.util.picard.GenomicSequence.GCPercent) CoverageFactory(com.github.lindenb.jvarkit.samtools.CoverageFactory) UnivariateFunction(org.apache.commons.math3.analysis.UnivariateFunction) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) OptionalDouble(java.util.OptionalDouble) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Locatable(htsjdk.samtools.util.Locatable)
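Note: the GC-bias correction above is interwoven with the tool's own GCAndDepth bins, interpolator options and smoothing window. The stand-alone sketch below isolates the numerical core using the same Apache Commons Math classes (LoessInterpolator, Median). It is illustrative only: the class GcNormalizationSketch, the normalize() helper and the toy gc/depth arrays are not part of jvarkit, the GC values are assumed to be strictly increasing with duplicates already merged, and the correction shown (raw depth divided by the fitted trend, then by the median) is a simplified variant of the scheme above.

// Stand-alone sketch, not jvarkit code: fit depth ~ f(GC%) with LOESS, then rescale by the median.
import java.util.Arrays;

import org.apache.commons.math3.analysis.UnivariateFunction;
import org.apache.commons.math3.analysis.interpolation.LoessInterpolator;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class GcNormalizationSketch {

    /** gc: GC fraction per bin, strictly increasing; depth: raw depth per bin, same length. */
    static double[] normalize(final double[] gc, final double[] depth) {
        // fit the GC -> depth trend; LoessInterpolator requires strictly increasing x values
        final UnivariateFunction trend = new LoessInterpolator().interpolate(gc, depth);
        // divide each bin by the expected depth at its GC content
        final double[] corrected = new double[depth.length];
        for (int i = 0; i < depth.length; i++) {
            final double expected = trend.value(gc[i]);
            corrected[i] = expected > 0 ? depth[i] / expected : Double.NaN;
        }
        // rescale so the median corrected depth is ~1.0
        final double median = new Median().evaluate(
                Arrays.stream(corrected).filter(d -> !Double.isNaN(d)).toArray());
        if (median > 0) {
            for (int i = 0; i < corrected.length; i++) corrected[i] /= median;
        }
        return corrected;
    }

    public static void main(final String[] args) {
        final double[] gc    = { 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60 };
        final double[] depth = { 18.0, 22.0, 30.0, 31.0, 29.0, 24.0, 19.0 };
        System.out.println(Arrays.toString(normalize(gc, depth)));
    }
}

After this correction a typical diploid bin sits near 1.0, which is the usual reason CNV-oriented tools normalize depth this way before applying DEL/DUP thresholds.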

Example 20 with ContigDictComparator

use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.

the class BedNonOverlappingSet method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.extend < 0) {
        LOG.info("extend cannot be negative.");
        return -1;
    }
    ArchiveFactory archiveFactory = null;
    PrintWriter manifest = null;
    try {
        final Comparator<String> cmpContig;
        if (this.refFile != null) {
            final SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(this.refFile);
            cmpContig = new ContigDictComparator(dict);
        } else {
            cmpContig = (A, B) -> A.compareTo(B);
        }
        final Comparator<Locatable> cmpbed = (A, B) -> {
            int i = cmpContig.compare(A.getContig(), B.getContig());
            if (i != 0)
                return i;
            i = Integer.compare(A.getStart(), B.getStart());
            if (i != 0)
                return i;
            i = Integer.compare(A.getEnd(), B.getEnd());
            return i;
        };
        if (this.manifestFile != null) {
            manifest = IOUtils.openPathForPrintWriter(this.manifestFile);
        } else {
            manifest = new PrintWriter(new NullOuputStream());
        }
        if (args.isEmpty()) {
            try (BufferedReader br = super.openBufferedReader(null)) {
                scan(br);
            }
        } else {
            for (final String fname : args) {
                try (BufferedReader br = super.openBufferedReader(fname)) {
                    scan(br);
                }
            }
        }
        if (this.bedsets.isEmpty()) {
            LOG.warn("No set was created");
        }
        archiveFactory = ArchiveFactory.open(this.archivePath);
        if (this.compressed) {
            // the per-set BED files are bgzip-compressed below, so disable the archive's own compression
            archiveFactory.setCompressionLevel(0);
        }
        for (int i = 0; i < this.bedsets.size(); i++) {
            final BlockCompressedOutputStream bcos;
            final PrintWriter pw;
            String filename = String.format("%05d", (i + 1));
            filename += FileExtensions.BED;
            if (this.compressed) {
                filename += ".gz";
                bcos = new BlockCompressedOutputStream(archiveFactory.openOuputStream(filename), (Path) null);
                pw = new PrintWriter(bcos);
            } else {
                bcos = null;
                pw = archiveFactory.openWriter(filename);
            }
            final IntervalTreeMap<BedRecord> itm = this.bedsets.get(i);
            final List<BedLine> list = itm.values().stream().map(R -> R.bedline).sorted(cmpbed).collect(Collectors.toList());
            LOG.info("saving " + filename + " " + list.size());
            for (final BedLine bl : list) {
                pw.println(bl.join());
            }
            if (bcos != null)
                bcos.flush();
            pw.flush();
            pw.close();
            manifest.println(filename + "\t" + list.size());
        }
        archiveFactory.close();
        archiveFactory = null;
        manifest.flush();
        manifest.close();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(archiveFactory);
        CloserUtil.close(manifest);
    }
}
Also used : Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) ArrayList(java.util.ArrayList) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) BlockCompressedOutputStream(htsjdk.samtools.util.BlockCompressedOutputStream) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) SAMSequenceDictionaryExtractor(htsjdk.variant.utils.SAMSequenceDictionaryExtractor) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) Locatable(htsjdk.samtools.util.Locatable) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) List(java.util.List) FileExtensions(htsjdk.samtools.util.FileExtensions) BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) Path(java.nio.file.Path) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) BlockCompressedOutputStream(htsjdk.samtools.util.BlockCompressedOutputStream) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) BufferedReader(java.io.BufferedReader) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) PrintWriter(java.io.PrintWriter) Locatable(htsjdk.samtools.util.Locatable)
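Note: the comparator wiring above (contig names ordered by a ContigDictComparator, then start, then end) is a recurring jvarkit pattern. The stand-alone sketch below isolates it; the SortByDictionarySketch class, the ref.dict path and the two SimpleInterval records are placeholders for illustration, and the contigs are assumed to be present in the dictionary.

// Stand-alone sketch, not part of BedNonOverlappingSet: sort Locatable records in reference dictionary order.
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import com.github.lindenb.jvarkit.samtools.util.SimpleInterval;
import com.github.lindenb.jvarkit.util.samtools.ContigDictComparator;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;

public class SortByDictionarySketch {
    public static void main(final String[] args) {
        // load the contig order from a reference .dict file (placeholder path)
        final SAMSequenceDictionary dict =
                SAMSequenceDictionaryExtractor.extractDictionary(Paths.get("ref.dict"));

        // ContigDictComparator orders contig *names* by their index in the dictionary
        final Comparator<String> cmpContig = new ContigDictComparator(dict);

        // compose a Locatable comparator as BedNonOverlappingSet does above;
        // new ContigDictComparator(dict).createLocatableComparator() is the ready-made equivalent
        final Comparator<Locatable> cmpBed = Comparator
                .comparing(Locatable::getContig, cmpContig)
                .thenComparingInt(Locatable::getStart)
                .thenComparingInt(Locatable::getEnd);

        final List<Locatable> records = new ArrayList<>();
        records.add(new SimpleInterval("chr2", 100, 200));
        records.add(new SimpleInterval("chr1", 500, 600));
        records.sort(cmpBed);
        records.forEach(r -> System.out.println(r.getContig() + ":" + r.getStart() + "-" + r.getEnd()));
    }
}

With a GRCh-style dictionary this prints chr1:500-600 before chr2:100-200; with plain lexicographic ordering (the fallback branch above when no REF is given) a contig such as chr10 would sort before chr2.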

Aggregations

ContigDictComparator (com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) 28
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary) 26
Parameter (com.beust.jcommander.Parameter) 25
Program (com.github.lindenb.jvarkit.util.jcommander.Program) 25
Logger (com.github.lindenb.jvarkit.util.log.Logger) 25
Path (java.nio.file.Path) 25
List (java.util.List) 25
Collectors (java.util.stream.Collectors) 23
Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher) 22
ArrayList (java.util.ArrayList) 22
Set (java.util.Set) 20
HashSet (java.util.HashSet) 19
IOUtils (com.github.lindenb.jvarkit.io.IOUtils) 18
SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) 18
StringUtils (com.github.lindenb.jvarkit.lang.StringUtils) 17
VCFHeader (htsjdk.variant.vcf.VCFHeader) 17
ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) 16
CloserUtil (htsjdk.samtools.util.CloserUtil) 16
VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder) 16
VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter) 16