
Example 56 with SimpleInterval

Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

Class StarRetroCopy, method doWork:

@Override
public int doWork(final List<String> args) {
    if (this.min_depth < 1) {
        LOG.error("Bad min depth");
        return -1;
    }
    PrintWriter saveInsertionsPw = null;
    SamReader sr = null;
    VariantContextWriter vcw0 = null;
    CloseableIterator<SAMRecord> iter = null;
    SAMFileWriter sfw = null;
    try {
        /* load the reference genome */
        /* create a contig name converter from the REF */
        // open the sam file
        final String input = oneFileOrNull(args);
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null)
            srf.referenceSequence(this.faidx);
        if (input != null) {
            sr = srf.open(SamInputResource.of(input));
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sr.getFileHeader());
            /* READ GTF FILES */
            loadGTF(dict);
            if (this.use_bai && !sr.hasIndex()) {
                LOG.warning("Cannot used bai because input is not indexed");
            }
            // if a BAI index is present, only query the BAM over the splicing regions
            if (this.use_bai && sr.hasIndex()) {
                LOG.info("building intervals...");
                final QueryInterval[] intervals = this.intronTreeMap.keySet().stream().flatMap(intron -> {
                    // we need the reads overlapping the exon bounds
                    final int tid = dict.getSequenceIndex(intron.getContig());
                    final int extend = 1 + Math.max(0, this.minCigarSize);
                    final QueryInterval q1 = new QueryInterval(tid, Math.max(1, intron.getStart() - extend), intron.getStart() + extend);
                    final QueryInterval q2 = new QueryInterval(tid, Math.max(1, intron.getEnd() - extend), intron.getEnd() + extend);
                    return Arrays.stream(new QueryInterval[] { q1, q2 });
                }).sorted().collect(HtsCollectors.optimizedQueryIntervals());
                LOG.debug("Query bam using " + intervals.length + " random access intervals. Please wait...");
                iter = sr.queryOverlapping(intervals);
            } else {
                iter = sr.iterator();
            }
        } else {
            sr = srf.open(SamInputResource.of(stdin()));
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sr.getFileHeader());
            /* READ GTF FILES */
            loadGTF(dict);
            iter = sr.iterator();
        }
        final SAMFileHeader samFileHeader = sr.getFileHeader();
        final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(samFileHeader);
        /* save gene writer */
        if (this.saveBedPeTo != null) {
            saveInsertionsPw = super.openPathOrStdoutAsPrintWriter(this.saveBedPeTo);
        } else {
            saveInsertionsPw = NullOuputStream.newPrintWriter();
        }
        if (this.saveBamTo != null) {
            sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(samFileHeader, true, this.saveBamTo);
        }
        final String sample = samFileHeader.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse("SAMPLE");
        final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(samFileHeader).logger(LOG).build();
        final String SAM_ATT_JI = "jI";
        while (iter.hasNext()) {
            final SAMRecord rec = progress.apply(iter.next());
            if (rec.getReadUnmappedFlag())
                continue;
            if (rec.getMappingQuality() < this.min_read_mapq)
                continue;
            if (rec.isSecondaryOrSupplementary())
                continue;
            if (rec.getDuplicateReadFlag())
                continue;
            if (rec.getReadFailsVendorQualityCheckFlag())
                continue;
            boolean save_read_to_bam = false;
            /* save the read if it is not a proper pair (insert-size problem) and it and its mate flank an intron */
            if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex()) && !rec.getProperPairFlag() && /* MUST NOT be proper pair */
            rec.getReadNegativeStrandFlag() != rec.getMateNegativeStrandFlag()) {
                final SimpleInterval intronInterval;
                if (rec.getEnd() + 1 < rec.getMateAlignmentStart()) {
                    intronInterval = new SimpleInterval(rec.getContig(), rec.getEnd() + 1, rec.getMateAlignmentStart() - 1);
                } else if (SAMUtils.hasMateCigar(rec) && SAMUtils.getMateAlignmentEnd(rec) + 1 < rec.getAlignmentStart()) {
                    intronInterval = new SimpleInterval(rec.getContig(), SAMUtils.getMateAlignmentEnd(rec) + 1, rec.getAlignmentStart() - 1);
                } else {
                    intronInterval = null;
                }
                if (intronInterval != null) {
                    if (this.intronTreeMap.getOverlapping(intronInterval).stream().flatMap(L -> L.stream()).anyMatch(S -> intronInterval.contains(S))) {
                        save_read_to_bam = true;
                    }
                }
            }
            /* WE use STAR DATA */
            if (!this.use_cigar_string) {
                if (!rec.hasAttribute(SAM_ATT_JI))
                    continue;
                final Object tagValue = rec.getAttribute(SAM_ATT_JI);
                paranoid.assertTrue((tagValue instanceof int[]));
                final int[] bounds = (int[]) tagValue;
                // jI:B:i,-1
                if (bounds.length == 1 && bounds[0] < 0)
                    continue;
                if (bounds.length % 2 != 0) {
                    LOG.warn("bound.length%2!=0 with " + rec.getSAMString());
                    continue;
                }
                for (int i = 0; i < bounds.length; i += 2) {
                    int intron_start = bounds[i];
                    int intron_end = bounds[i + 1];
                    final Interval r = new Interval(rec.getContig(), intron_start, intron_end);
                    // don't use getOverlapping(): with STAR the jI bounds are exactly the intron boundaries
                    final List<Segment> introns = this.intronTreeMap.get(r);
                    if (introns == null)
                        continue;
                    save_read_to_bam = true;
                    for (final Segment intron : introns) {
                        intron.match++;
                    }
                }
            } else /* WE use other bam like bwa-mem */
            {
                final Cigar cigar = rec.getCigar();
                if (cigar == null || cigar.numCigarElements() < 2)
                    continue;
                int ref1 = rec.getAlignmentStart();
                for (final CigarElement ce : cigar.getCigarElements()) {
                    final CigarOperator op = ce.getOperator();
                    final int ref_end = ref1 + (op.consumesReferenceBases() ? ce.getLength() : 0);
                    if (op.equals(CigarOperator.N) || op.equals(CigarOperator.D)) {
                        final Interval r = new Interval(rec.getContig(), ref1, ref_end - 1);
                        final List<Segment> introns = this.intronTreeMap.get(r);
                        if (introns == null)
                            continue;
                        save_read_to_bam = true;
                        for (final Segment intron : introns) {
                            intron.match++;
                        }
                    }
                    ref1 = ref_end;
                }
                /**
                 * 2019-07-29. I tried using the SA:Z: tag but it doesn't work well, so only look at the clipping.
                 */
                if (cigar.isClipped()) {
                    for (int side = 0; side < 2; side++) {
                        final Interval r;
                        if (side == 0 && cigar.isRightClipped() && cigar.getLastCigarElement().getLength() >= this.minCigarSize) {
                            r = new Interval(rec.getContig(), rec.getEnd() + 1, rec.getEnd() + 1 + this.minCigarSize);
                        } else if (side == 1 && cigar.isLeftClipped() && cigar.getFirstCigarElement().getLength() >= this.minCigarSize) {
                            r = new Interval(rec.getContig(), Math.max(1, rec.getStart() - (1 + this.minCigarSize)), Math.max(1, rec.getStart() - (1)));
                        } else {
                            continue;
                        }
                        final int final_side = side;
                        final List<Segment> introns = this.intronTreeMap.getOverlapping(r).stream().flatMap(L -> L.stream()).filter(SEG -> isWithinIntronBound(SEG, r, final_side)).collect(Collectors.toList());
                        if (introns.isEmpty())
                            continue;
                        // System.err.println("SA for "+r+" "+rec.getReadName()+" "+introns.size());
                        save_read_to_bam = true;
                        for (final Segment intron : introns) {
                            intron.match++;
                        }
                    }
                }
            }
            if (save_read_to_bam) {
                saveInsertion(saveInsertionsPw, rec);
                if (sfw != null)
                    sfw.addAlignment(rec);
            }
        }
        final ContigDictComparator contigCmp = new ContigDictComparator(refDict);
        this.all_transcripts.removeIf(T -> T.segments.stream().noneMatch(S -> S.match >= min_depth));
        final int max_introns = this.all_transcripts.stream().mapToInt(K -> K.segments.size()).max().orElse(1);
        final List<String> intron_names = IntStream.range(0, max_introns).mapToObj(IDX -> String.format("%s_INTRON_%04d", sample, 1 + IDX)).collect(Collectors.toList());
        /**
         * build vcf header
         */
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
        metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
        metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
        for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
            metaData.add(new VCFInfoHeaderLine(att, 1, VCFHeaderLineType.String, "Value for the attribute '" + att + "' in the gtf"));
        }
        // metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
        // metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
        // "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
        metaData.add(new VCFFormatHeaderLine(INTRON_START, 1, VCFHeaderLineType.Integer, "Introns start"));
        metaData.add(new VCFFormatHeaderLine(INTRON_END, 1, VCFHeaderLineType.Integer, "Introns end"));
        metaData.add(new VCFFilterHeaderLine(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold, "Number of supporting reads is lower than " + this.low_depth_threshold));
        metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
        final VCFHeader header = new VCFHeader(metaData, intron_names);
        JVarkitVersion.getInstance().addMetaData(this, header);
        header.setSequenceDictionary(refDict);
        /* open vcf for writing*/
        vcw0 = super.openVariantContextWriter(this.outputFile);
        final VariantContextWriter vcw = vcw0;
        vcw.writeHeader(header);
        Collections.sort(this.all_transcripts, (A, B) -> {
            int i = contigCmp.compare(A.getContig(), B.getContig());
            if (i != 0)
                return i;
            i = Integer.compare(A.getStart(), B.getStart());
            if (i != 0)
                return i;
            return Integer.compare(A.getEnd(), B.getEnd());
        });
        final Allele ref = Allele.create((byte) 'N', true);
        final Allele alt = Allele.create("<RETROCOPY>", false);
        for (final Transcript kg : this.all_transcripts) {
            // ok good candidate
            final VariantContextBuilder vcb = new VariantContextBuilder();
            vcb.chr(kg.getContig());
            vcb.start(kg.getStart());
            vcb.stop(kg.getEnd());
            vcb.id(kg.transcript_id);
            final List<Allele> alleles = Arrays.asList(ref, alt);
            final int max_depth = kg.segments.stream().mapToInt(X -> X.match).max().orElse(0);
            vcb.attribute(VCFConstants.DEPTH_KEY, max_depth);
            vcb.log10PError(max_depth / -10.0);
            boolean filter_set = false;
            if (max_depth < this.low_depth_threshold) {
                vcb.filter(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold);
                filter_set = true;
            }
            vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, 2);
            vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, 1);
            vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5);
            vcb.attribute(VCFConstants.SVTYPE, "DEL");
            vcb.attribute(VCFConstants.END_KEY, kg.getEnd());
            vcb.attribute("SVLEN", kg.getLengthOnReference());
            for (final String att : kg.attributes.keySet()) {
                vcb.attribute(att, VCFUtils.escapeInfoField(kg.attributes.get(att)));
            }
            vcb.alleles(alleles);
            // introns sequences
            vcb.attribute(ATT_INTRONS_CANDIDATE_COUNT, kg.segments.stream().filter(I -> I.match > 0).count());
            vcb.attribute(ATT_INTRONS_COUNT, kg.segments.size());
            vcb.attribute(ATT_INTRONS_CANDIDATE_FRACTION, kg.segments.stream().filter(I -> I.match > 0).count() / (float) kg.segments.size());
            if (kg.segments.stream().filter(I -> I.match > 0).count() != kg.segments.size()) {
                vcb.filter(ATT_NOT_ALL_INTRONS);
                filter_set = true;
            }
            final List<Genotype> genotypes = new ArrayList<>(kg.segments.size());
            /* build genotypes */
            for (int i = 0; i < kg.segments.size(); i++) {
                final Segment intron = kg.segments.get(i);
                final GenotypeBuilder gb = new GenotypeBuilder(intron_names.get(i), Arrays.asList(ref, alt));
                gb.DP(intron.match);
                gb.attribute(INTRON_START, intron.start);
                gb.attribute(INTRON_END, intron.end);
                genotypes.add(gb.make());
            }
            vcb.genotypes(genotypes);
            if (!filter_set) {
                vcb.passFilters();
            }
            vcw.add(vcb.make());
        }
        progress.close();
        vcw.close();
        iter.close();
        iter = null;
        sr.close();
        sr = null;
        saveInsertionsPw.flush();
        saveInsertionsPw.close();
        saveInsertionsPw = null;
        if (sfw != null) {
            sfw.close();
            sfw = null;
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sr);
        CloserUtil.close(vcw0);
        CloserUtil.close(sfw);
        CloserUtil.close(saveInsertionsPw);
    }
}
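
For reference, here is a minimal stand-alone sketch of the random-access step above. The helper name boundsToQueryIntervals is hypothetical (not part of jvarkit): it builds one QueryInterval around each splice boundary and relies on htsjdk's QueryInterval.optimizeIntervals to sort and merge them, which appears to be the role of HtsCollectors.optimizedQueryIntervals() in the code above.

import htsjdk.samtools.QueryInterval;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import java.util.Collection;
import java.util.stream.Stream;

// Hypothetical helper: collect BAI query intervals covering both boundaries of each intron.
static QueryInterval[] boundsToQueryIntervals(final SAMSequenceDictionary dict, final Collection<? extends Locatable> introns, final int extend) {
    final QueryInterval[] raw = introns.stream().flatMap(intron -> {
        final int tid = dict.getSequenceIndex(intron.getContig());
        // one window around the intron start, one around the intron end
        final QueryInterval q1 = new QueryInterval(tid, Math.max(1, intron.getStart() - extend), intron.getStart() + extend);
        final QueryInterval q2 = new QueryInterval(tid, Math.max(1, intron.getEnd() - extend), intron.getEnd() + extend);
        return Stream.of(q1, q2);
    }).toArray(QueryInterval[]::new);
    // sort and merge overlapping/adjacent intervals before SamReader.queryOverlapping()
    return QueryInterval.optimizeIntervals(raw);
}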

Example 57 with SimpleInterval

Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

Class StarRetroCopy, method saveInsertion:

private void saveInsertion(final PrintWriter pw, final SAMRecord rec) {
    if (this.saveBedPeTo == null)
        return;
    final List<SimpleInterval> rec2list = new ArrayList<>();
    final Predicate<Locatable> shouldBeSaved = REC2 -> {
        if (!rec.getContig().equals(REC2.getContig())) {
            return true;
        }
        if (rec.overlaps(REC2))
            return false;
        return this.intronTreeMap.getOverlapping(rec).stream().flatMap(L -> L.stream()).map(L -> L.transcript).noneMatch(P -> P.overlaps(REC2));
    };
    if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && !rec.getProperPairFlag()) {
        final int mateStart = rec.getMateAlignmentStart();
        final int mateEnd;
        if (SAMUtils.hasMateCigar(rec)) {
            mateEnd = SAMUtils.getMateAlignmentEnd(rec);
        } else {
            mateEnd = mateStart;
        }
        final SimpleInterval rec2 = new SimpleInterval(rec.getMateReferenceName(), mateStart, mateEnd);
        if (shouldBeSaved.test(rec2)) {
            rec2list.add(rec2);
        }
    }
    /* add supplementary alignments that could be evidence */
    SAMUtils.getOtherCanonicalAlignments(rec).stream().filter(shouldBeSaved).forEach(R -> rec2list.add(new SimpleInterval(R)));
    if (rec2list.isEmpty())
        return;
    for (final SimpleInterval rec2 : rec2list) {
        final String distanceStr;
        if (rec.overlaps(rec2)) {
            distanceStr = "0";
        } else if (rec.contigsMatch(rec2)) {
            distanceStr = String.valueOf(Math.abs(Math.min(rec.getStart() - rec2.getEnd(), rec2.getStart() - rec.getEnd())));
        } else {
            distanceStr = ".";
        }
        // save insertions
        pw.print(rec.getContig());
        pw.print("\t");
        pw.print(rec.getStart() - 1);
        pw.print("\t");
        pw.print(rec.getEnd());
        pw.print("\t");
        pw.print(rec2.getContig());
        pw.print("\t");
        pw.print(rec2.getStart() - 1);
        pw.print("\t");
        pw.print(rec2.getEnd());
        pw.print("\t");
        // name
        pw.print(rec.getReadName() + (rec.getReadPairedFlag() ? (rec.getFirstOfPairFlag() ? "/1" : "/2") : ""));
        pw.print("\t");
        // score
        pw.print(".");
        pw.print("\t");
        // strand 1
        pw.print(".");
        pw.print("\t");
        // strand 2
        pw.print(".");
        pw.print("\t");
        // "Any number of additional, user-defined fields ..."
        pw.print(distanceStr);
        pw.println();
    }
}
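
The distance column written above can be isolated into a small helper built only on htsjdk's Locatable default methods. The name distanceColumn is illustrative and not part of the class; it is a sketch of the same logic:

import htsjdk.samtools.util.Locatable;

// Illustrative helper mirroring the BEDPE "distance" field computed above.
static String distanceColumn(final Locatable a, final Locatable b) {
    if (a.overlaps(b)) {
        // the two fragments touch or overlap
        return "0";
    }
    if (a.contigsMatch(b)) {
        // gap between the two fragments, whichever comes first on the contig
        return String.valueOf(Math.abs(Math.min(a.getStart() - b.getEnd(), b.getStart() - a.getEnd())));
    }
    // different contigs: distance undefined
    return ".";
}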

Example 58 with SimpleInterval

Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

Class SAM4WebLogo, method doWork:

@Override
public int doWork(final List<String> args) {
    if (this.fastq_quality_padding_str.length() != 1) {
        LOG.error("Bad fastq padding character (length!=1)");
        return -1;
    }
    if (this.fastq_quality_unknown_str.length() != 1) {
        LOG.error("Bad fastq unknown character (length!=1)");
        return -1;
    }
    PrintWriter out = null;
    SamReader samReader = null;
    SAMRecordIterator iter = null;
    try {
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null)
            srf.referenceSequence(this.faidx);
        final List<Locatable> intervals = this.intervalListProvider.stream().collect(Collectors.toList());
        if (!this.output_format.equals(Format.tabular) && intervals.size() > 1) {
            LOG.error("Only one interval is allowed for format " + this.output_format);
            return -1;
        }
        final List<Path> inputPath = IOUtils.unrollPaths(args);
        out = super.openPathOrStdoutAsPrintWriter(outputFile);
        for (int locatable_idx = 0; locatable_idx < intervals.size(); ++locatable_idx) {
            if (locatable_idx > 0)
                out.println();
            final Locatable interval = intervals.get(locatable_idx);
            final List<SAMRecord> buffer = new ArrayList<>();
            final SortedMap<Integer, Integer> genomicpos2insertlen = new TreeMap<>();
            for (final Path samPath : inputPath) {
                samReader = srf.open(SamInputResource.of(samPath));
                final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(samReader.getFileHeader());
                final ContigNameConverter ctgNameConverter = ContigNameConverter.fromOneDictionary(dict);
                final SimpleInterval segment = ctgNameConverter.convertToSimpleInterval(interval).orElseThrow(() -> new JvarkitException.ContigNotFoundInDictionary(interval.getContig(), dict));
                final int extend = useClip ? 1000 : 0;
                iter = samReader.query(segment.getContig(), Math.max(1, segment.getStart() - extend), segment.getEnd() + extend, false);
                while (iter.hasNext()) {
                    final SAMRecord rec = iter.next();
                    if (rec.getReadUnmappedFlag())
                        continue;
                    if (this.SamRecordFilter.filterOut(rec))
                        continue;
                    final Cigar cigar = rec.getCigar();
                    if (cigar == null || cigar.isEmpty())
                        continue;
                    if (!rec.getReferenceName().equals(segment.getContig()))
                        continue;
                    if (useClip) {
                        if (rec.getUnclippedEnd() < segment.getStart())
                            continue;
                        if (rec.getUnclippedStart() > segment.getEnd())
                            continue;
                    } else {
                        if (rec.getEnd() < segment.getStart())
                            continue;
                        if (rec.getStart() > segment.getEnd())
                            continue;
                    }
                    // free memory
                    if (!this.output_format.equals(Format.fastq)) {
                        rec.setBaseQualities(SAMRecord.NULL_QUALS);
                    }
                    final List<SAMTagAndValue> atts = rec.getAttributes();
                    for (final SAMTagAndValue stv : atts) {
                        if (stv.tag.equals("RG"))
                            continue;
                        rec.setAttribute(stv.tag, null);
                    }
                    int refPos = rec.getStart();
                    for (final CigarElement ce : cigar) {
                        final CigarOperator op = ce.getOperator();
                        if (op.equals(CigarOperator.I)) {
                            genomicpos2insertlen.put(refPos, Math.max(ce.getLength(), genomicpos2insertlen.getOrDefault(refPos, 0)));
                        }
                        if (op.consumesReferenceBases())
                            refPos += ce.getLength();
                        if (refPos > interval.getEnd())
                            break;
                    }
                    buffer.add(rec);
                }
                iter.close();
                iter = null;
                samReader.close();
                samReader = null;
            }
            /* compute columns */
            if (interval != null) {
                /**
                 * test if genomic position is in interval
                 */
                final IntPredicate testInInterval = pos -> interval.getStart() <= pos && pos <= interval.getEnd();
                final ArrayList<Integer> column2genomic = new ArrayList<>(interval.getLengthOnReference());
                for (int x = interval.getStart(); x <= interval.getEnd(); ++x) {
                    column2genomic.add(x);
                }
                // insert insertions in reverse order
                for (int pos : genomicpos2insertlen.keySet().stream().sorted(Collections.reverseOrder()).collect(Collectors.toList())) {
                    if (!testInInterval.test(pos))
                        continue;
                    final int insert_len = genomicpos2insertlen.get(pos);
                    for (int x = 0; x < insert_len; ++x) {
                        column2genomic.add(pos - interval.getStart(), NO_POS);
                    }
                }
                // ruler
                if (this.output_format.equals(Format.tabular)) {
                    int x = 0;
                    while (x < column2genomic.size()) {
                        if (column2genomic.get(x) == NO_POS || column2genomic.get(x) % 10 != 0) {
                            out.print(' ');
                            ++x;
                        } else {
                            final String s = String.valueOf(column2genomic.get(x));
                            if (s.length() < 10 && x + s.length() < column2genomic.size()) {
                                out.print(s);
                                x += s.length();
                            } else {
                                out.print(' ');
                                ++x;
                            }
                        }
                    }
                    out.println(" " + interval.toString());
                }
                for (final SAMRecord rec : buffer) {
                    final String recQualString = rec.getBaseQualityString();
                    final Function<Integer, Character> read2qual;
                    if (recQualString == null || SAMRecord.NULL_QUALS_STRING.equals(recQualString)) {
                        read2qual = IDX -> '~';
                    } else {
                        read2qual = IDX -> {
                            if (IDX < 0 || IDX >= recQualString.length())
                                return '~';
                            return recQualString.charAt(IDX);
                        };
                    }
                    final byte[] rec_bases = rec.getReadBases();
                    final Function<Integer, Character> read2base;
                    if (SAMRecord.NULL_SEQUENCE.equals(rec_bases)) {
                        read2base = IDX -> '?';
                    } else {
                        read2base = IDX -> {
                            if (IDX < 0 || IDX >= rec_bases.length)
                                return '?';
                            return (char) rec_bases[IDX];
                        };
                    }
                    final StringBuilder seqBuilder = new StringBuilder(column2genomic.size());
                    final StringBuilder qualBuilder = new StringBuilder(column2genomic.size());
                    int x = 0;
                    int readRef = rec.getUnclippedStart();
                    int readpos = 0;
                    final Cigar cigar = rec.getCigar();
                    for (final CigarElement ce : cigar) {
                        final CigarOperator op = ce.getOperator();
                        if (op.equals(CigarOperator.PADDING))
                            continue;
                        for (int cigarIdx = 0; cigarIdx < ce.getLength(); ++cigarIdx) {
                            // pad before base
                            while (!op.equals(CigarOperator.INSERTION) && x < column2genomic.size() && (column2genomic.get(x) == NO_POS || column2genomic.get(x) < readRef)) {
                                seqBuilder.append('-');
                                qualBuilder.append('-');
                                ++x;
                            }
                            switch(op) {
                                case I:
                                    {
                                        if (!this.hide_insertions) {
                                            if (testInInterval.test(readRef)) {
                                                seqBuilder.append(Character.toLowerCase(read2base.apply(readpos)));
                                                qualBuilder.append(read2qual.apply(readpos));
                                                ++x;
                                            }
                                        }
                                        ++cigarIdx;
                                        ++readpos;
                                        break;
                                    }
                                case P:
                                    break;
                                case H:
                                    {
                                        if (testInInterval.test(readRef)) {
                                            seqBuilder.append(this.useClip ? 'n' : '-');
                                            qualBuilder.append(this.useClip ? this.fastq_quality_unknown_str : "-");
                                            ++x;
                                        }
                                        ++readRef;
                                        break;
                                    }
                                case S:
                                    {
                                        if (testInInterval.test(readRef)) {
                                            seqBuilder.append(this.useClip ? Character.toLowerCase((read2base.apply(readpos))) : '-');
                                            qualBuilder.append(this.useClip ? (char) read2qual.apply(readpos) : '-');
                                            ++x;
                                        }
                                        ++readpos;
                                        ++readRef;
                                        break;
                                    }
                                case D:
                                case N:
                                    {
                                        if (testInInterval.test(readRef)) {
                                            seqBuilder.append('>');
                                            qualBuilder.append('>');
                                            ++x;
                                        }
                                        ++readRef;
                                        break;
                                    }
                                case X:
                                case EQ:
                                case M:
                                    {
                                        if (testInInterval.test(readRef)) {
                                            seqBuilder.append(read2base.apply(readpos));
                                            qualBuilder.append(read2qual.apply(readpos));
                                            ++x;
                                        }
                                        ++readpos;
                                        ++readRef;
                                        break;
                                    }
                                default:
                                    throw new IllegalArgumentException(op.name());
                            }
                        }
                    // end cigarIdx
                    }
                    /* right pad */
                    while (x < column2genomic.size()) {
                        seqBuilder.append('-');
                        qualBuilder.append('-');
                        ++x;
                    }
                    final String readName = this.samRecordNaming.apply(rec);
                    switch(this.output_format) {
                        case fastq:
                            out.print(FastqConstants.SEQUENCE_HEADER);
                            out.print(readName);
                            out.println();
                            out.print(seqBuilder);
                            out.println();
                            out.println(FastqConstants.QUALITY_HEADER);
                            out.println(qualBuilder);
                            break;
                        case tabular:
                            out.print(seqBuilder);
                            out.print(' ');
                            out.println(readName);
                            break;
                        default:
                            out.print('>');
                            out.print(readName);
                            out.println();
                            out.print(seqBuilder);
                            out.println();
                            break;
                    }
                }
            // end loop over SAMRec
            }
        // end if interval !=null
        }
        out.flush();
        out.close();
        out = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(samReader);
        CloserUtil.close(out);
    }
}
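
The contig-renaming step is where this method obtains its SimpleInterval: the user interval may say "chr1" while the BAM dictionary says "1". A minimal sketch, assuming the same jvarkit API used above (the helper name resolveOnDictionary is hypothetical):

import com.github.lindenb.jvarkit.samtools.util.SimpleInterval;
import com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;

// Hypothetical helper: translate a user interval to the BAM's own contig naming before querying.
static SimpleInterval resolveOnDictionary(final Locatable userInterval, final SAMSequenceDictionary dict) {
    final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
    // convertToSimpleInterval returns an Optional, consistent with the orElseThrow usage above
    return converter.convertToSimpleInterval(userInterval)
        .orElseThrow(() -> new IllegalArgumentException("Contig " + userInterval.getContig() + " not found in the BAM dictionary"));
}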

Example 59 with SimpleInterval

Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

Class CopyNumber01, method doWork:

@Override
public int doWork(final List<String> args) {
    ReferenceSequenceFile indexedFastaSequenceFile = null;
    try {
        this.sexContigSet.addAll(Arrays.stream(this.sexContigStr.split("[, \t]")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
        /* loading REF Reference */
        indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
        final Comparator<Locatable> locComparator = new ContigDictComparator(dict).createLocatableComparator();
        final List<Locatable> intervals = new ArrayList<>();
        if (this.bedFile == null) {
            for (final Locatable loc : dict.getSequences()) {
                intervals.add(loc);
            }
        } else {
            final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
            try (BedLineReader br = new BedLineReader(this.bedFile)) {
                br.stream().filter(L -> !StringUtil.isBlank(converter.apply(L.getContig()))).forEach(B -> {
                    final String ctg = converter.apply(B.getContig());
                    intervals.add(new SimpleInterval(ctg, B.getStart(), B.getEnd()));
                });
            }
        }
        if (intervals.isEmpty()) {
            LOG.error("no interval defined.");
            return -1;
        }
        LOG.info("intervals N=" + intervals.size() + " mean-size:" + intervals.stream().mapToInt(R -> R.getLengthOnReference()).average().orElse(0.0));
        final List<GCAndDepth> user_items = new ArrayList<>();
        // split intervals
        for (final Locatable loc : intervals) {
            int pos = loc.getStart();
            while (pos < loc.getEnd()) {
                final int pos_end = Math.min(pos + this.windowSize, loc.getEnd());
                final GCAndDepth dataRow = new GCAndDepth(loc.getContig(), pos, pos_end);
                if (dataRow.getLengthOnReference() < this.windowMin) {
                    break;
                }
                user_items.add(dataRow);
                pos += this.windowShift;
            }
        }
        // free memory
        intervals.clear();
        LOG.info("sorting N=" + user_items.size());
        Collections.sort(user_items, locComparator);
        // fill gc percent
        LOG.info("fill gc% N=" + user_items.size());
        for (final String ctg : user_items.stream().map(T -> T.getContig()).collect(Collectors.toSet())) {
            final GenomicSequence gseq = new GenomicSequence(indexedFastaSequenceFile, ctg);
            for (final GCAndDepth dataRow : user_items) {
                if (!dataRow.getContig().equals(ctg))
                    continue;
                final GCPercent gc = gseq.getGCPercent(dataRow.getStart(), dataRow.getEnd());
                if (gc.isEmpty())
                    continue;
                dataRow.gc = gc.getGCPercent();
            }
        }
        // remove strange gc
        user_items.removeIf(B -> B.gc < this.minGC);
        user_items.removeIf(B -> B.gc > this.maxGC);
        LOG.info("remove high/low gc% N=" + user_items.size());
        if (user_items.stream().allMatch(P -> isSex(P.getContig()))) {
            LOG.error("All chromosomes are defined as sexual. Cannot normalize");
            return -1;
        }
        final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mappingQuality);
        try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
            /* header */
            pw.println("#CHROM\tSTART\tEND\tSample\tIDX\tGC\tRAW-DEPTH\tNORM-DEPTH");
            for (final Path bamPath : IOUtils.unrollPaths(args)) {
                // open this samReader
                try (SamReader samReader = super.createSamReaderFactory().referenceSequence(this.refFile).open(bamPath)) {
                    if (!samReader.hasIndex()) {
                        LOG.error("file is not indexed " + bamPath);
                        return -1;
                    }
                    final SAMFileHeader header = samReader.getFileHeader();
                    SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
                    final String sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toSet()).stream().collect(HtsCollectors.toSingleton());
                    final List<GCAndDepth> bam_items = new ArrayList<>(user_items.size());
                    /* loop over contigs */
                    for (final SAMSequenceRecord ssr : dict.getSequences()) {
                        /* create a **COPY** of the intervals */
                        final List<GCAndDepth> ctgitems = user_items.stream().filter(T -> T.contigsMatch(ssr)).collect(Collectors.toList());
                        if (ctgitems.isEmpty())
                            continue;
                        LOG.info("Getting coverage for " + ssr.getSequenceName() + " N=" + ctgitems.size());
                        // get coverage
                        final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(samReader, ctgitems, sampleName);
                        // fill coverage
                        for (final GCAndDepth gc : ctgitems) {
                            final OptionalDouble optCov;
                            switch(this.univariateDepth) {
                                case median:
                                    optCov = coverage.getMedian(gc);
                                    break;
                                case mean:
                                    optCov = coverage.getAverage(gc);
                                    break;
                                default:
                                    throw new IllegalStateException();
                            }
                            gc.raw_depth = optCov.orElse(-1.0);
                            gc.norm_depth = gc.raw_depth;
                        }
                        ctgitems.removeIf(V -> V.raw_depth < 0);
                        ctgitems.removeIf(V -> V.raw_depth > this.weirdMaxDepth);
                        ctgitems.removeIf(V -> V.raw_depth < this.weirdMinDepth);
                        if (ctgitems.isEmpty())
                            continue;
                        bam_items.addAll(ctgitems);
                    }
                    double[] y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.raw_depth).toArray();
                    LOG.info("median raw depth " + new Median().evaluate(y, 0, y.length));
                    Collections.sort(bam_items, (a, b) -> {
                        final int i = Double.compare(a.getX(), b.getX());
                        if (i != 0)
                            return i;
                        return Double.compare(a.getY(), b.getY());
                    });
                    double[] x = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getX()).toArray();
                    y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getY()).toArray();
                    // get min GC
                    final double min_x = x[0];
                    // get max GC
                    final double max_x = x[x.length - 1];
                    LOG.info("min/max gc " + min_x + " " + max_x);
                    /* merge adjacent x (GC%) having same values */
                    int i = 0;
                    int k = 0;
                    while (i < x.length) {
                        int j = i + 1;
                        while (j < x.length && Precision.equals(x[i], x[j])) {
                            ++j;
                        }
                        x[k] = x[i];
                        y[k] = this.univariateGCLoess.create().evaluate(y, i, j - i);
                        ++k;
                        i = j;
                    }
                    LOG.info("merged n=" + (x.length - k) + " items.");
                    /* reduce size of x and y */
                    final List<XY> xyL = new ArrayList<>(k);
                    for (int t = 0; t < k; ++t) {
                        xyL.add(new XYImpl(x[t], y[t]));
                    }
                    /* sort on X (GC%), then on Y */
                    Collections.sort(xyL, (a, b) -> {
                        final int d = Double.compare(a.getX(), b.getX());
                        if (d != 0)
                            return d;
                        return Double.compare(a.getY(), b.getY());
                    });
                    x = xyL.stream().mapToDouble(R -> R.getX()).toArray();
                    y = xyL.stream().mapToDouble(R -> R.getY()).toArray();
                    final UnivariateInterpolator interpolator = createInterpolator();
                    UnivariateFunction spline = null;
                    try {
                        spline = interpolator.interpolate(x, y);
                    } catch (final org.apache.commons.math3.exception.NumberIsTooSmallException err) {
                        spline = null;
                        LOG.error("Cannot use " + interpolator.getClass().getName() + ":" + err.getMessage());
                    }
                    // drop points outside the fitted GC range and normalize each depth on the spline
                    int points_removed = 0;
                    i = 0;
                    while (i < bam_items.size()) {
                        final GCAndDepth r = bam_items.get(i);
                        if (spline == null) {
                            ++i;
                        } else if (r.getX() < min_x || r.getX() > max_x) {
                            bam_items.remove(i);
                            ++points_removed;
                        } else {
                            final double norm;
                            if (this.gcDepthInterpolation.equals(UnivariateInerpolation.identity)) {
                                norm = r.getY();
                            } else {
                                norm = spline.value(r.getX());
                            }
                            if (Double.isNaN(norm) || Double.isInfinite(norm)) {
                                LOG.info("NAN " + r);
                                bam_items.remove(i);
                                ++points_removed;
                                continue;
                            }
                            r.norm_depth = norm;
                            ++i;
                        }
                    }
                    LOG.info("removed " + points_removed + ". now N=" + bam_items.size());
                    if (bam_items.isEmpty())
                        continue;
                    spline = null;
                    // DO NOT NORMALIZE ON MINIMUM DEPTH, STUPID.
                    // normalize on median
                    y = bam_items.stream().mapToDouble(G -> G.getY()).toArray();
                    final double median_depth = this.univariateMid.create().evaluate(y, 0, y.length);
                    LOG.info("median norm depth : " + median_depth);
                    for (i = 0; median_depth > 0 && i < bam_items.size(); ++i) {
                        final GCAndDepth gc = bam_items.get(i);
                        gc.norm_depth /= median_depth;
                    }
                    // restore genomic order
                    Collections.sort(bam_items, locComparator);
                    // smoothing values with neighbours
                    y = bam_items.stream().mapToDouble(V -> V.getY()).toArray();
                    for (i = 0; i < bam_items.size(); ++i) {
                        final GCAndDepth gc = bam_items.get(i);
                        int left = i;
                        for (int j = Math.max(0, i - this.smooth_window); j <= i; ++j) {
                            final GCAndDepth gc2 = bam_items.get(j);
                            if (!gc2.withinDistanceOf(gc, this.smoothDistance))
                                continue;
                            left = j;
                            break;
                        }
                        int right = i;
                        for (int j = i; j <= i + this.smooth_window && j < bam_items.size(); ++j) {
                            final GCAndDepth gc2 = bam_items.get(j);
                            if (!gc2.withinDistanceOf(gc, this.smoothDistance))
                                break;
                            right = j;
                         // intentionally no break: keep extending 'right' while neighbours stay within the smoothing distance
                        }
                        gc.norm_depth = this.univariateSmooth.create().evaluate(y, left, (right - left) + 1);
                    }
                    /* print data */
                    for (final GCAndDepth r : bam_items) {
                        pw.print(r.getContig());
                        pw.print('\t');
                        pw.print(r.getStart() - 1);
                        pw.print('\t');
                        pw.print(r.getEnd());
                        pw.print('\t');
                        pw.print(sampleName);
                        pw.print('\t');
                        pw.print(getGenomicIndex(dict, r.getContig(), r.getStart()) - 1);
                        pw.print('\t');
                        pw.printf("%.3f", r.gc);
                        pw.print('\t');
                        pw.printf("%.3f", r.raw_depth);
                        pw.print('\t');
                        pw.printf("%.3f", r.norm_depth);
                        pw.println();
                    }
                    pw.flush();
                }
            // samReader
            }
            // end loop bamPath
            pw.flush();
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(indexedFastaSequenceFile);
    }
}
Also used : Arrays(java.util.Arrays) Precision(org.apache.commons.math3.util.Precision) NevilleInterpolator(org.apache.commons.math3.analysis.interpolation.NevilleInterpolator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) UnivariateInterpolator(org.apache.commons.math3.analysis.interpolation.UnivariateInterpolator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) CoverageFactory(com.github.lindenb.jvarkit.samtools.CoverageFactory) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) Median(org.apache.commons.math3.stat.descriptive.rank.Median) Path(java.nio.file.Path) MathIllegalArgumentException(org.apache.commons.math3.exception.MathIllegalArgumentException) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) LinearInterpolator(org.apache.commons.math3.analysis.interpolation.LinearInterpolator) LoessInterpolator(org.apache.commons.math3.analysis.interpolation.LoessInterpolator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) UnivariateFunction(org.apache.commons.math3.analysis.UnivariateFunction) SplineInterpolator(org.apache.commons.math3.analysis.interpolation.SplineInterpolator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) OptionalDouble(java.util.OptionalDouble) HtsCollectors(com.github.lindenb.jvarkit.stream.HtsCollectors) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) AbstractUnivariateStatistic(org.apache.commons.math3.stat.descriptive.AbstractUnivariateStatistic) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Locatable(htsjdk.samtools.util.Locatable) GCPercent(com.github.lindenb.jvarkit.util.picard.GenomicSequence.GCPercent) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) DividedDifferenceInterpolator(org.apache.commons.math3.analysis.interpolation.DividedDifferenceInterpolator) Identity(org.apache.commons.math3.analysis.function.Identity) DimensionMismatchException(org.apache.commons.math3.exception.DimensionMismatchException) SamReader(htsjdk.samtools.SamReader) DynamicParameter(com.beust.jcommander.DynamicParameter) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) Collections(java.util.Collections) ArrayList(java.util.ArrayList) Median(org.apache.commons.math3.stat.descriptive.rank.Median) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) 
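
Note: the loop above fits depth against GC% with a commons-math3 interpolator, drops outliers, rescales on the median and smooths each window over its neighbours. The stand-alone sketch below shows the core of that GC-bias correction (fit, divide by the fitted value, rescale on the median). The arrays are invented for illustration and the real tool fills them from BAM coverage, so treat this as a minimal sketch of the idea rather than the tool's exact arithmetic.

import java.util.Arrays;
import org.apache.commons.math3.analysis.UnivariateFunction;
import org.apache.commons.math3.analysis.interpolation.LoessInterpolator;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class GcNormalizeSketch {
    public static void main(final String[] args) {
        // hypothetical per-window GC fraction (x, strictly increasing) and raw depth (y)
        final double[] gc    = { 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60 };
        final double[] depth = { 18.0, 22.0, 30.0, 33.0, 31.0, 25.0, 19.0 };
        // fit the expected depth as a function of GC%
        final UnivariateFunction spline = new LoessInterpolator().interpolate(gc, depth);
        // divide each window by the depth expected at its GC%
        final double[] norm = new double[depth.length];
        for (int i = 0; i < depth.length; i++) {
            norm[i] = depth[i] / spline.value(gc[i]);
        }
        // rescale on the median so a typical window ends up near 1.0
        final double median = new Median().evaluate(norm);
        for (int i = 0; i < norm.length; i++) {
            norm[i] /= median;
        }
        System.out.println(Arrays.toString(norm));
    }
}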

Example 60 with SimpleInterval

use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

the class ReferenceToHtml method doWork.

@Override
public int doWork(final List<String> args) {
    try {
        try (VCFReader reader = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)), true)) {
            final VCFHeader header = reader.getHeader();
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
            final Optional<String> buildName = SequenceDictionaryUtils.getBuildName(dict);
            try (ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx)) {
                SequenceUtil.assertSequenceDictionariesEqual(SequenceDictionaryUtils.extractRequired(reference), dict);
                final List<Locatable> locs = IntervalListProvider.from(this.regionsStr).dictionary(dict).stream().sorted(new ContigDictComparator(dict).createLocatableComparator()).collect(Collectors.toList());
                if (locs.isEmpty()) {
                    LOG.error("no region was defined");
                    return -1;
                }
                final XMLOutputFactory xof = XMLOutputFactory.newFactory();
                try (ArchiveFactory archive = ArchiveFactory.open(archiveOutput)) {
                    try (PrintWriter pw = archive.openWriter(this.prefix + "script.js")) {
                        pw.println(StaticCodeExtractor.forClass(this.getClass()).extract("SCRIPT").get());
                        pw.flush();
                    }
                    try (PrintWriter pw = archive.openWriter(this.prefix + "style.css")) {
                        pw.println(StaticCodeExtractor.forClass(this.getClass()).extract("CSS").get());
                        pw.flush();
                    }
                    final OutputStream index_os = archive.openOuputStream(this.prefix + "index.html");
                    final XMLStreamWriter index_html = xof.createXMLStreamWriter(index_os, "UTF-8");
                    index_html.writeStartDocument("UTF-8", "1.0");
                    index_html.writeStartElement("html");
                    index_html.writeStartElement("head");
                    writeMeta(index_html);
                    index_html.writeStartElement("title");
                    index_html.writeCharacters(this.getProgramName());
                    // title
                    index_html.writeEndElement();
                    index_html.writeStartElement("link");
                    index_html.writeAttribute("rel", "stylesheet");
                    index_html.writeAttribute("href", this.prefix + "style.css");
                    index_html.writeCharacters("");
                    // link
                    index_html.writeEndElement();
                    // head
                    index_html.writeEndElement();
                    index_html.writeStartElement("body");
                    index_html.writeStartElement("ul");
                    for (final Locatable loc : locs) {
                        final List<VariantContext> variants;
                        try (CloseableIterator<VariantContext> iter = reader.query(loc)) {
                            variants = iter.stream().collect(Collectors.toList());
                        }
                        StringWriter sw = new StringWriter();
                        sw.append("var g = ");
                        JsonWriter jsw = new JsonWriter(sw);
                        jsw.beginObject();
                        jsw.name("contig").value(loc.getContig());
                        jsw.name("start").value(loc.getStart());
                        jsw.name("end").value(loc.getEnd());
                        jsw.name("length").value(loc.getLengthOnReference());
                        jsw.name("sequence").value(reference.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getEnd()).getBaseString());
                        jsw.name("variants");
                        jsw.beginArray();
                        for (VariantContext ctx : variants) {
                            jsw.beginObject();
                            jsw.name("start").value(ctx.getStart());
                            jsw.name("end").value(ctx.getEnd());
                            jsw.name("id").value(ctx.hasID() ? ctx.getID() : "");
                            jsw.name("type").value(ctx.getType().name());
                            jsw.name("ref").value(ctx.getReference().getDisplayString());
                            jsw.name("alts");
                            jsw.beginArray();
                            for (Allele alt : ctx.getAlternateAlleles()) {
                                jsw.value(alt.getDisplayString());
                            }
                            jsw.endArray();
                            jsw.endObject();
                        }
                        jsw.endArray();
                        jsw.endObject();
                        jsw.flush();
                        jsw.close();
                        sw.append(";");
                        final String filename = this.prefix + loc.getContig() + "_" + loc.getStart() + "_" + loc.getEnd() + ".html";
                        final String title = (buildName.isPresent() ? buildName.get() + " " : "") + new SimpleInterval(loc).toNiceString();
                        OutputStream os = archive.openOuputStream(filename);
                        XMLStreamWriter out = xof.createXMLStreamWriter(os, "UTF-8");
                        out.writeStartDocument("UTF-8", "1.0");
                        out.writeStartElement("html");
                        out.writeStartElement("head");
                        writeMeta(out);
                        out.writeStartElement("script");
                        out.writeCharacters(sw.toString());
                        // script
                        out.writeEndElement();
                        out.writeStartElement("script");
                        out.writeAttribute("src", this.prefix + "script.js");
                        out.writeCharacters("");
                        // script
                        out.writeEndElement();
                        out.writeStartElement("link");
                        out.writeAttribute("rel", "stylesheet");
                        out.writeAttribute("href", this.prefix + "style.css");
                        out.writeCharacters("");
                        // link
                        out.writeEndElement();
                        out.writeStartElement("title");
                        out.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
                        // title
                        out.writeEndElement();
                        // head
                        out.writeEndElement();
                        out.writeStartElement("body");
                        out.writeStartElement("h1");
                        out.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
                        out.writeEndElement();
                        out.writeStartElement("div");
                        checkBox(out, "showPos", "Line Number");
                        checkBox(out, "showSpace", "Spaces");
                        checkBox(out, "showVar", "Show Variants");
                        out.writeCharacters(" ");
                        out.writeEmptyElement("input");
                        out.writeAttribute("id", "primer");
                        out.writeAttribute("type", "text");
                        out.writeAttribute("placeholder", "Primer Sequence");
                        out.writeStartElement("button");
                        out.writeAttribute("id", "search");
                        out.writeCharacters("Search...");
                        out.writeEndElement();
                        // div
                        out.writeEndElement();
                        out.writeStartElement("div");
                        out.writeAttribute("class", "sequence");
                        out.writeAttribute("id", "main");
                        // div
                        out.writeEndElement();
                        // body
                        out.writeEndElement();
                        // html
                        out.writeEndElement();
                        os.flush();
                        os.close();
                        index_html.writeStartElement("li");
                        index_html.writeStartElement("a");
                        index_html.writeAttribute("href", filename);
                        index_html.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
                        // a
                        index_html.writeEndElement();
                        // li
                        index_html.writeEndElement();
                    }
                    // ul
                    index_html.writeEndElement();
                    // body
                    index_html.writeEndElement();
                    // html
                    index_html.writeEndElement();
                    index_html.writeEndDocument();
                    index_html.close();
                    index_os.flush();
                    index_os.close();
                }
            }
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    }
}
Also used : ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) OutputStream(java.io.OutputStream) VariantContext(htsjdk.variant.variantcontext.VariantContext) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) JsonWriter(com.google.gson.stream.JsonWriter) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Allele(htsjdk.variant.variantcontext.Allele) StringWriter(java.io.StringWriter) VCFReader(htsjdk.variant.vcf.VCFReader) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VCFHeader(htsjdk.variant.vcf.VCFHeader) Locatable(htsjdk.samtools.util.Locatable) PrintWriter(java.io.PrintWriter)
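
Note: ReferenceToHtml emits, for each region, a JSON payload (read back by the shared script.js) wrapped inside an XHTML page written with StAX. That pairing of gson's JsonWriter with an XMLStreamWriter reduces to the sketch below; the region coordinates are invented and no VCF is read, so it only illustrates the serialization pattern, not the tool itself.

import java.io.StringWriter;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
import com.google.gson.stream.JsonWriter;

public class RegionPageSketch {
    public static void main(final String[] args) throws Exception {
        // hypothetical region; the real tool takes it from its interval list and the VCF
        final String contig = "chr1";
        final int start = 100_001, end = 100_050;
        // 1) build the JSON blob the page's javascript will consume
        final StringWriter sw = new StringWriter();
        sw.append("var g = ");
        final JsonWriter jsw = new JsonWriter(sw);
        jsw.beginObject();
        jsw.name("contig").value(contig);
        jsw.name("start").value(start);
        jsw.name("end").value(end);
        jsw.name("variants").beginArray().endArray(); // empty in this sketch
        jsw.endObject();
        jsw.close();
        sw.append(";");
        // 2) embed it in a minimal HTML document with StAX
        final StringWriter html = new StringWriter();
        final XMLStreamWriter out = XMLOutputFactory.newFactory().createXMLStreamWriter(html);
        out.writeStartDocument("UTF-8", "1.0");
        out.writeStartElement("html");
        out.writeStartElement("script");
        out.writeCharacters(sw.toString());
        out.writeEndElement(); // script
        out.writeEndElement(); // html
        out.writeEndDocument();
        out.close();
        System.out.println(html);
    }
}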

Aggregations

SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval) 71
ArrayList (java.util.ArrayList) 49
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary) 47
List (java.util.List) 47
Locatable (htsjdk.samtools.util.Locatable) 46
Path (java.nio.file.Path) 46
Parameter (com.beust.jcommander.Parameter) 43
Program (com.github.lindenb.jvarkit.util.jcommander.Program) 43
Logger (com.github.lindenb.jvarkit.util.log.Logger) 43
SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) 39
Collectors (java.util.stream.Collectors) 38
Set (java.util.Set) 37
Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher) 36
IOUtils (com.github.lindenb.jvarkit.io.IOUtils) 35
ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) 35
SAMFileHeader (htsjdk.samtools.SAMFileHeader) 34
CloserUtil (htsjdk.samtools.util.CloserUtil) 34
CloseableIterator (htsjdk.samtools.util.CloseableIterator) 33
StringUtils (com.github.lindenb.jvarkit.lang.StringUtils) 32
SamReader (htsjdk.samtools.SamReader) 32
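
Across these aggregated examples the recurring idiom is the same: user or VCF coordinates become a Locatable, which jvarkit's SimpleInterval copies into a plain 1-based, closed interval. Below is a minimal sketch restricted to the calls actually seen above (the Locatable-copy constructor, the Locatable accessors and toNiceString()); htsjdk's Interval only stands in for whatever produced the coordinates, and the toNiceString() output shown in the comment is indicative only.

import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Locatable;
import com.github.lindenb.jvarkit.samtools.util.SimpleInterval;

public class SimpleIntervalSketch {
    public static void main(final String[] args) {
        // hypothetical coordinates; any Locatable (VariantContext, Interval, ...) works here
        final Locatable loc = new Interval("chr1", 100_001, 100_050);
        final SimpleInterval si = new SimpleInterval(loc);
        System.out.println(si.getContig());            // chr1
        System.out.println(si.getStart());             // 100001 (1-based, inclusive)
        System.out.println(si.getEnd());               // 100050 (inclusive)
        System.out.println(si.getLengthOnReference()); // 50
        System.out.println(si.toNiceString());         // human-readable form, e.g. chr1:100,001-100,050
    }
}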