Search in sources :

Example 66 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class StarRetroCopy method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.min_depth < 1) {
        LOG.error("Bad min depth");
        return -1;
    }
    PrintWriter saveInsertionsPw = null;
    SamReader sr = null;
    VariantContextWriter vcw0 = null;
    CloseableIterator<SAMRecord> iter = null;
    SAMFileWriter sfw = null;
    try {
        /* load the reference genome */
        /* create a contig name converter from the REF */
        // open the sam file
        final String input = oneFileOrNull(args);
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null)
            srf.referenceSequence(this.faidx);
        if (input != null) {
            sr = srf.open(SamInputResource.of(input));
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sr.getFileHeader());
            /* READ KNOWGENES FILES */
            loadGTF(dict);
            if (this.use_bai && !sr.hasIndex()) {
                LOG.warning("Cannot used bai because input is not indexed");
            }
            // if there is a bai, only query the bam in the regions of splicing
            if (this.use_bai && sr.hasIndex()) {
                LOG.info("building intervals...");
                final QueryInterval[] intervals = this.intronTreeMap.keySet().stream().flatMap(intron -> {
                    // we need the reads overlapping the exon bounds
                    final int tid = dict.getSequenceIndex(intron.getContig());
                    final int extend = 1 + Math.max(0, this.minCigarSize);
                    final QueryInterval q1 = new QueryInterval(tid, Math.max(1, intron.getStart() - extend), intron.getStart() + extend);
                    final QueryInterval q2 = new QueryInterval(tid, Math.max(1, intron.getEnd() - extend), intron.getEnd() + extend);
                    return Arrays.stream(new QueryInterval[] { q1, q2 });
                }).sorted().collect(HtsCollectors.optimizedQueryIntervals());
                LOG.debug("Query bam using " + intervals.length + " random access intervals. Please wait...");
                iter = sr.queryOverlapping(intervals);
            } else {
                iter = sr.iterator();
            }
        } else {
            sr = srf.open(SamInputResource.of(stdin()));
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sr.getFileHeader());
            /* READ GTF FILES */
            loadGTF(dict);
            iter = sr.iterator();
        }
        final SAMFileHeader samFileHeader = sr.getFileHeader();
        final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(samFileHeader);
        /* save gene writer */
        if (this.saveBedPeTo != null) {
            saveInsertionsPw = super.openPathOrStdoutAsPrintWriter(this.saveBedPeTo);
        } else {
            saveInsertionsPw = NullOuputStream.newPrintWriter();
        }
        if (this.saveBamTo != null) {
            sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(samFileHeader, true, this.saveBamTo);
        }
        final String sample = samFileHeader.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse("SAMPLE");
        final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(samFileHeader).logger(LOG).build();
        final String SAM_ATT_JI = "jI";
        while (iter.hasNext()) {
            final SAMRecord rec = progress.apply(iter.next());
            if (rec.getReadUnmappedFlag())
                continue;
            if (rec.getMappingQuality() < this.min_read_mapq)
                continue;
            if (rec.isSecondaryOrSupplementary())
                continue;
            if (rec.getDuplicateReadFlag())
                continue;
            if (rec.getReadFailsVendorQualityCheckFlag())
                continue;
            boolean save_read_to_bam = false;
            /* save read if it is not properly mapped (problem with size) and he and his mate surround an intron */
            if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex()) && !rec.getProperPairFlag() && /* MUST NOT be proper pair */
            rec.getReadNegativeStrandFlag() != rec.getMateNegativeStrandFlag()) {
                final SimpleInterval intronInterval;
                if (rec.getEnd() + 1 < rec.getMateAlignmentStart()) {
                    intronInterval = new SimpleInterval(rec.getContig(), rec.getEnd() + 1, rec.getMateAlignmentStart() - 1);
                } else if (SAMUtils.hasMateCigar(rec) && SAMUtils.getMateAlignmentEnd(rec) + 1 < rec.getAlignmentStart()) {
                    intronInterval = new SimpleInterval(rec.getContig(), SAMUtils.getMateAlignmentEnd(rec) + 1, rec.getAlignmentStart() - 1);
                } else {
                    intronInterval = null;
                }
                if (intronInterval != null) {
                    if (this.intronTreeMap.getOverlapping(intronInterval).stream().flatMap(L -> L.stream()).anyMatch(S -> intronInterval.contains(S))) {
                        save_read_to_bam = true;
                    }
                }
            }
            /* WE use STAR DATA */
            if (!this.use_cigar_string) {
                if (!rec.hasAttribute(SAM_ATT_JI))
                    continue;
                final Object tagValue = rec.getAttribute(SAM_ATT_JI);
                paranoid.assertTrue((tagValue instanceof int[]));
                final int[] bounds = (int[]) tagValue;
                // jI:B:i,-1
                if (bounds.length == 1 && bounds[0] < 0)
                    continue;
                if (bounds.length % 2 != 0) {
                    LOG.warn("bound.length%2!=0 with " + rec.getSAMString());
                    continue;
                }
                for (int i = 0; i < bounds.length; i += 2) {
                    int intron_start = bounds[i];
                    int intron_end = bounds[i + 1];
                    final Interval r = new Interval(rec.getContig(), intron_start, intron_end);
                    // don't use overlapping : with STAR it is strictly the intron boundaries
                    final List<Segment> introns = this.intronTreeMap.get(r);
                    if (introns == null)
                        continue;
                    save_read_to_bam = true;
                    for (final Segment intron : introns) {
                        intron.match++;
                    }
                }
            } else /* WE use other bam like bwa-mem */
            {
                final Cigar cigar = rec.getCigar();
                if (cigar == null || cigar.numCigarElements() < 2)
                    continue;
                int ref1 = rec.getAlignmentStart();
                for (final CigarElement ce : cigar.getCigarElements()) {
                    final CigarOperator op = ce.getOperator();
                    final int ref_end = ref1 + (op.consumesReferenceBases() ? ce.getLength() : 0);
                    if (op.equals(CigarOperator.N) || op.equals(CigarOperator.D)) {
                        final Interval r = new Interval(rec.getContig(), ref1, ref_end - 1);
                        final List<Segment> introns = this.intronTreeMap.get(r);
                        if (introns == null)
                            continue;
                        save_read_to_bam = true;
                        for (final Segment intron : introns) {
                            intron.match++;
                        }
                    }
                    ref1 = ref_end;
                }
                /**
                 * 2019-07-29. I tried using SA:Z:tag doesn't work well , so let's look a the clipping only
                 */
                if (cigar.isClipped()) {
                    for (int side = 0; side < 2; side++) {
                        final Interval r;
                        if (side == 0 && cigar.isRightClipped() && cigar.getLastCigarElement().getLength() >= this.minCigarSize) {
                            r = new Interval(rec.getContig(), rec.getEnd() + 1, rec.getEnd() + 1 + this.minCigarSize);
                        } else if (side == 1 && cigar.isLeftClipped() && cigar.getFirstCigarElement().getLength() >= this.minCigarSize) {
                            r = new Interval(rec.getContig(), Math.max(1, rec.getStart() - (1 + this.minCigarSize)), Math.max(1, rec.getStart() - (1)));
                        } else {
                            continue;
                        }
                        final int final_side = side;
                        final List<Segment> introns = this.intronTreeMap.getOverlapping(r).stream().flatMap(L -> L.stream()).filter(SEG -> isWithinIntronBound(SEG, r, final_side)).collect(Collectors.toList());
                        if (introns.isEmpty())
                            continue;
                        // System.err.println("SA for "+r+" "+rec.getReadName()+" "+introns.size());
                        save_read_to_bam = true;
                        for (final Segment intron : introns) {
                            intron.match++;
                        }
                    }
                }
            }
            if (save_read_to_bam) {
                saveInsertion(saveInsertionsPw, rec);
                if (sfw != null)
                    sfw.addAlignment(rec);
            }
        }
        final ContigDictComparator contigCmp = new ContigDictComparator(refDict);
        this.all_transcripts.removeIf(T -> T.segments.stream().noneMatch(S -> S.match >= min_depth));
        final int max_introns = this.all_transcripts.stream().mapToInt(K -> K.segments.size()).max().orElse(1);
        final List<String> intron_names = IntStream.range(0, max_introns).mapToObj(IDX -> String.format("%s_INTRON_%04d", sample, 1 + IDX)).collect(Collectors.toList());
        /**
         * build vcf header
         */
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
        metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
        metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
        for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
            metaData.add(new VCFInfoHeaderLine(att, 1, VCFHeaderLineType.String, "Value for the attribute '" + att + "' in the gtf"));
        }
        // metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
        // metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
        // "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
        metaData.add(new VCFFormatHeaderLine(INTRON_START, 1, VCFHeaderLineType.Integer, "Introns start"));
        metaData.add(new VCFFormatHeaderLine(INTRON_END, 1, VCFHeaderLineType.Integer, "Introns end"));
        metaData.add(new VCFFilterHeaderLine(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold, "Number of read is lower than :" + this.min_depth));
        metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
        final VCFHeader header = new VCFHeader(metaData, intron_names);
        JVarkitVersion.getInstance().addMetaData(this, header);
        header.setSequenceDictionary(refDict);
        /* open vcf for writing*/
        vcw0 = super.openVariantContextWriter(this.outputFile);
        final VariantContextWriter vcw = vcw0;
        vcw.writeHeader(header);
        Collections.sort(this.all_transcripts, (A, B) -> {
            int i = contigCmp.compare(A.getContig(), B.getContig());
            if (i != 0)
                return i;
            i = Integer.compare(A.getStart(), B.getStart());
            if (i != 0)
                return i;
            return Integer.compare(A.getEnd(), B.getEnd());
        });
        final Allele ref = Allele.create((byte) 'N', true);
        final Allele alt = Allele.create("<RETROCOPY>", false);
        for (final Transcript kg : this.all_transcripts) {
            // ok good candidate
            final VariantContextBuilder vcb = new VariantContextBuilder();
            vcb.chr(kg.getContig());
            vcb.start(kg.getStart());
            vcb.stop(kg.getEnd());
            vcb.id(kg.transcript_id);
            final List<Allele> alleles = Arrays.asList(ref, alt);
            final int max_depth = kg.segments.stream().mapToInt(X -> X.match).max().orElse(0);
            vcb.attribute(VCFConstants.DEPTH_KEY, max_depth);
            vcb.log10PError(max_depth / -10.0);
            boolean filter_set = false;
            if (max_depth < this.low_depth_threshold) {
                vcb.filter(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold);
                filter_set = true;
            }
            vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, 2);
            vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, 1);
            vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5);
            vcb.attribute(VCFConstants.SVTYPE, "DEL");
            vcb.attribute(VCFConstants.END_KEY, kg.getEnd());
            vcb.attribute("SVLEN", kg.getLengthOnReference());
            for (final String att : kg.attributes.keySet()) {
                vcb.attribute(att, VCFUtils.escapeInfoField(kg.attributes.get(att)));
            }
            vcb.alleles(alleles);
            // introns sequences
            vcb.attribute(ATT_INTRONS_CANDIDATE_COUNT, kg.segments.stream().filter(I -> I.match > 0).count());
            vcb.attribute(ATT_INTRONS_COUNT, kg.segments.size());
            vcb.attribute(ATT_INTRONS_CANDIDATE_FRACTION, kg.segments.stream().filter(I -> I.match > 0).count() / (float) kg.segments.size());
            if (kg.segments.stream().filter(I -> I.match > 0).count() != kg.segments.size()) {
                vcb.filter(ATT_NOT_ALL_INTRONS);
                filter_set = true;
            }
            final List<Genotype> genotypes = new ArrayList<>(kg.segments.size());
            /* build genotypes */
            for (int i = 0; i < kg.segments.size(); i++) {
                final Segment intron = kg.segments.get(i);
                final GenotypeBuilder gb = new GenotypeBuilder(intron_names.get(i), Arrays.asList(ref, alt));
                gb.DP(intron.match);
                gb.attribute(INTRON_START, intron.start);
                gb.attribute(INTRON_END, intron.end);
                genotypes.add(gb.make());
            }
            vcb.genotypes(genotypes);
            if (!filter_set) {
                vcb.passFilters();
            }
            vcw.add(vcb.make());
        }
        progress.close();
        vcw.close();
        iter.close();
        iter = null;
        sr.close();
        sr = null;
        saveInsertionsPw.flush();
        saveInsertionsPw.close();
        saveInsertionsPw = null;
        if (sfw != null) {
            sfw.close();
            sfw = null;
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sr);
        CloserUtil.close(vcw0);
        CloserUtil.close(sfw);
        CloserUtil.close(saveInsertionsPw);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Map(java.util.Map) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) SAMFileWriter(htsjdk.samtools.SAMFileWriter) Collectors(java.util.stream.Collectors) GTFCodec(com.github.lindenb.jvarkit.util.bio.gtf.GTFCodec) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) IntStream(java.util.stream.IntStream) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) SAMUtils(htsjdk.samtools.SAMUtils) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) HtsCollectors(com.github.lindenb.jvarkit.stream.HtsCollectors) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) Files(java.nio.file.Files) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) File(java.io.File) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) SamInputResource(htsjdk.samtools.SamInputResource) QueryInterval(htsjdk.samtools.QueryInterval) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Paranoid(com.github.lindenb.jvarkit.lang.Paranoid) BufferedReader(java.io.BufferedReader) Collections(java.util.Collections) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) QueryInterval(htsjdk.samtools.QueryInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) SamReader(htsjdk.samtools.SamReader) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) Genotype(htsjdk.variant.variantcontext.Genotype) CigarOperator(htsjdk.samtools.CigarOperator) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarElement(htsjdk.samtools.CigarElement) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval)

Example 67 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class FindNewSpliceSites method doWork.

@Override
public int doWork(final List<String> args) {
    SamReader sfr = null;
    PrintWriter bedWriter = null;
    SortingCollection<Junction> junctionSorter = null;
    try {
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null) {
            srf.referenceSequence(this.faidx);
        }
        final String input = oneFileOrNull(args);
        sfr = input == null ? srf.open(SamInputResource.of(stdin())) : srf.open(SamInputResource.of(input));
        final SAMFileHeader header0 = sfr.getFileHeader();
        try (GtfReader gftReader = new GtfReader(this.gtfPath)) {
            SAMSequenceDictionary dict = header0.getSequenceDictionary();
            if (dict != null)
                gftReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
            gftReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.getExonCount() > 1).flatMap(T -> T.getIntrons().stream()).map(T -> T.toInterval()).forEach(T -> {
                this.intronMap.put(T, T);
            });
        }
        final SAMFileHeader header1 = header0.clone();
        final SAMProgramRecord p = header1.createProgramRecord();
        p.setCommandLine(getProgramCommandLine());
        p.setProgramVersion(getVersion());
        p.setProgramName(getProgramName());
        this.sfw = this.writingBamArgs.openSamWriter(outputFile, header1, true);
        final SAMFileHeader header2 = header0.clone();
        final SAMProgramRecord p2 = header2.createProgramRecord();
        p2.setCommandLine(getProgramCommandLine());
        p2.setProgramVersion(getVersion());
        p2.setProgramName(getProgramName());
        this.weird = this.writingBamArgs.createSAMFileWriterFactory().makeSAMWriter(header2, true, new NullOuputStream());
        if (this.bedOut != null) {
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sfr.getFileHeader());
            this.junctionComparator = new ContigDictComparator(dict).createLocatableComparator();
            junctionSorter = SortingCollection.newInstance(Junction.class, new JunctionCodec(), (A, B) -> A.compare2(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        }
        scan(sfr, p, p2, junctionSorter);
        sfr.close();
        if (this.bedOut != null) {
            junctionSorter.doneAdding();
            bedWriter = super.openPathOrStdoutAsPrintWriter(this.bedOut);
            final String sample = StringUtils.ifBlank(header0.getReadGroups().stream().map(RG -> RG.getSample()).filter(s -> !StringUtils.isBlank(s)).collect(Collectors.toCollection(TreeSet::new)).stream().collect(Collectors.joining(";")), ".");
            try (CloseableIterator<Junction> iter = junctionSorter.iterator()) {
                final EqualRangeIterator<Junction> eq = new EqualRangeIterator<>(iter, (A, B) -> A.compare1(B));
                while (eq.hasNext()) {
                    final List<Junction> row = eq.next();
                    final Junction first = row.get(0);
                    bedWriter.print(first.getContig());
                    bedWriter.print('\t');
                    bedWriter.print(first.getStart() - 1);
                    bedWriter.print('\t');
                    bedWriter.print(first.getEnd());
                    bedWriter.print('\t');
                    bedWriter.print(sample);
                    bedWriter.print('\t');
                    bedWriter.print(first.name);
                    bedWriter.print('\t');
                    bedWriter.print(row.size());
                    bedWriter.println();
                }
                eq.close();
            }
            bedWriter.flush();
            bedWriter.close();
            bedWriter = null;
            junctionSorter.cleanup();
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(sfr);
        CloserUtil.close(this.sfw);
        CloserUtil.close(this.weird);
        CloserUtil.close(bedWriter);
    }
}
Also used : DataInputStream(java.io.DataInputStream) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) DataOutputStream(java.io.DataOutputStream) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) SortingCollection(htsjdk.samtools.util.SortingCollection) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Collection(java.util.Collection) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) IOException(java.io.IOException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SamReader(htsjdk.samtools.SamReader) EOFException(java.io.EOFException) Collectors(java.util.stream.Collectors) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SamInputResource(htsjdk.samtools.SamInputResource) SAMProgramRecord(htsjdk.samtools.SAMProgramRecord) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) Comparator(java.util.Comparator) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) SAMProgramRecord(htsjdk.samtools.SAMProgramRecord) IOException(java.io.IOException) EOFException(java.io.EOFException) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) SamReader(htsjdk.samtools.SamReader) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) TreeSet(java.util.TreeSet) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) SAMFileHeader(htsjdk.samtools.SAMFileHeader) PrintWriter(java.io.PrintWriter)

Example 68 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class RNASeqPolyA method doWork.

@Override
public int doWork(final List<String> args) {
    final Map<String, GeneInfo> geneToTrancripts = new HashMap<>();
    try {
        final String debugTranscript = dynaParams.getOrDefault("debug.transcript", "");
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.faidx);
        if (!StringUtils.isBlank(this.limit_contig) && dict.getSequence(this.limit_contig) == null) {
            throw new JvarkitException.ContigNotFoundInDictionary(this.limit_contig, dict);
        }
        final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
        final Gff3Codec gff3 = new Gff3Codec(Gff3Codec.DecodeDepth.DEEP);
        try (InputStream is = IOUtils.openPathForReading(this.gffPath)) {
            final AsciiLineReader asciiLineReader = AsciiLineReader.from(is);
            final LineIterator lr = new LineIteratorImpl(asciiLineReader);
            while (!gff3.isDone(lr)) {
                decodeGff3Feature(gff3.decode(lr), converter, geneToTrancripts);
            }
            gff3.close(lr);
            asciiLineReader.close();
        }
        if (geneToTrancripts.isEmpty()) {
            LOG.warn("no transcript was found.");
        // continue , empty VCF must be produced
        }
        final IntervalTreeMap<LastExon> exonMap = new IntervalTreeMap<>();
        // fill exonMap
        geneToTrancripts.values().stream().flatMap(K -> K.transcripts.values().stream()).forEach(X -> exonMap.put(new Interval(X), X));
        final ToIntFunction<String> toTid = C -> {
            final SAMSequenceRecord ssr = dict.getSequence(C);
            if (ssr == null)
                throw new JvarkitException.ContigNotFoundInDictionary(C, dict);
            return ssr.getSequenceIndex();
        };
        final List<Path> inputs = IOUtils.unrollPaths(args);
        final QueryInterval[] intervals = this.disable_bam_index || inputs.isEmpty() ? null : QueryInterval.optimizeIntervals(exonMap.values().stream().map(R -> new QueryInterval(toTid.applyAsInt(R.getContig()), R.getPosition(), R.getPosition())).toArray(N -> new QueryInterval[N]));
        final SamReaderFactory srf = super.createSamReaderFactory().referenceSequence(this.faidx);
        final String primerAAA;
        if (polyA_primer_size > 0) {
            primerAAA = StringUtils.repeat(this.polyA_primer_size, 'A');
        } else {
            primerAAA = null;
        }
        final Set<String> samples = new HashSet<>();
        int bam_index = 0;
        // loop over the bams
        for (; ; ) {
            final Path bamFilename = inputs.isEmpty() ? null : inputs.get(bam_index);
            try (SamReader sr = inputs.isEmpty() ? srf.open(SamInputResource.of(stdin())) : srf.open(bamFilename)) {
                final SAMFileHeader header0 = sr.getFileHeader();
                SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header0));
                final String sample = header0.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(bamFilename == null ? "STDIN" : IOUtils.getFilenameWithoutCommonSuffixes(bamFilename));
                if (samples.contains(sample)) {
                    LOG.error("duplicate sample " + sample);
                    return -1;
                }
                samples.add(sample);
                final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).build();
                try (CloseableIterator<SAMRecord> iter = (intervals == null || inputs.isEmpty() ? /*stdin*/
                sr.iterator() : sr.query(intervals, false))) {
                    while (iter.hasNext()) {
                        final SAMRecord rec = progress.apply(iter.next());
                        if (rec.getReadUnmappedFlag())
                            continue;
                        if (!StringUtils.isBlank(this.limit_contig) && !rec.getContig().equals(this.limit_contig))
                            continue;
                        if (this.default_read_filter && !SAMRecordDefaultFilter.accept(rec))
                            continue;
                        final Collection<LastExon> lastExons = exonMap.getOverlapping(rec);
                        if (lastExons.isEmpty())
                            continue;
                        final Cigar cigar = rec.getCigar();
                        if (cigar == null || cigar.isEmpty())
                            continue;
                        final byte[] bases = rec.getReadBases();
                        if (bases == null || SAMRecord.NULL_SEQUENCE.equals(bases))
                            continue;
                        for (LastExon exon : lastExons) {
                            if (!StringUtils.isBlank(debugTranscript) && !exon.transcriptId.equals(debugTranscript)) {
                                continue;
                            }
                            ExonCount count = exon.sample2count.get(sample);
                            if (count == null) {
                                count = new ExonCount();
                                exon.sample2count.put(sample, count);
                            }
                            final StringBuilder sb = new StringBuilder();
                            boolean indel_flag = false;
                            boolean last_exon_in_intron_flag = false;
                            boolean match_last_base = false;
                            int ref1 = rec.getUnclippedStart();
                            int read0 = 0;
                            for (CigarElement ce : cigar) {
                                if (exon.isMinusStrand() && ref1 > exon.start)
                                    break;
                                if (this.ignore_with_indels && indel_flag)
                                    break;
                                final CigarOperator op = ce.getOperator();
                                switch(op) {
                                    case P:
                                        break;
                                    case I:
                                        indel_flag = true;
                                        for (int i = 0; i < ce.getLength(); i++) {
                                            if (exon.isAfterExon(ref1)) {
                                                sb.append((char) Character.toUpperCase(bases[read0]));
                                            }
                                            read0++;
                                        }
                                        break;
                                    case D:
                                    case N:
                                        if ((exon.isPlusStrand() && CoordMath.overlaps(ref1, ref1 + ce.getLength() - 1, exon.getEnd(), exon.getEnd() + 1)) || (exon.isMinusStrand() && CoordMath.overlaps(ref1, ref1 + ce.getLength() - 1, exon.getStart() - 1, exon.getStart()))) {
                                            last_exon_in_intron_flag = true;
                                        }
                                        ref1 += ce.getLength();
                                        indel_flag = true;
                                        break;
                                    case H:
                                        for (int i = 0; i < ce.getLength(); i++) {
                                            if (exon.isAfterExon(ref1)) {
                                                sb.append('N');
                                            }
                                            if (ref1 == exon.getPosition())
                                                match_last_base = true;
                                            ref1++;
                                        }
                                        break;
                                    case S:
                                    case M:
                                    case X:
                                    case EQ:
                                        for (int i = 0; i < ce.getLength(); i++) {
                                            if (exon.isAfterExon(ref1)) {
                                                sb.append((char) Character.toUpperCase(bases[read0]));
                                            }
                                            if (ref1 == exon.getPosition())
                                                match_last_base = true;
                                            read0++;
                                            ref1++;
                                        }
                                        break;
                                    default:
                                        throw new IllegalStateException(op.name());
                                }
                            }
                            // premature end or start
                            if (!match_last_base || (exon.isPlusStrand() && ref1 < exon.getEnd()) || (exon.isMinusStrand() && ref1 < exon.getStart()) || (this.ignore_with_indels && indel_flag) || last_exon_in_intron_flag) {
                                continue;
                            }
                            // if(read0!=bases.length) throw new IllegalStateException("read0:"+read0+" expected "+bases.length+" in "+rec.getReadName());
                            // if(ref1!=1+rec.getUnclippedEnd())throw new IllegalStateException("ref1:"+ref1+" expected 1+"+rec.getUnclippedEnd()+" in "+rec.getReadName());
                            ++count.n_tested_reads;
                            String polyA;
                            if (exon.isMinusStrand()) {
                                polyA = AcidNucleics.reverseComplement(sb);
                            } else {
                                polyA = sb.toString();
                            }
                            if (primerAAA != null) {
                                final int pos = polyA.indexOf(primerAAA);
                                if (pos > 0)
                                    polyA = polyA.substring(pos);
                            }
                            int count_polyA = 0;
                            for (int i = 0; i < polyA.length(); i++) {
                                if (polyA.charAt(i) != 'A')
                                    break;
                                count_polyA++;
                            }
                            if (count_polyA > 0) {
                                count.n_tested_reads_with_A++;
                                count.sum_polyA += count_polyA;
                            }
                            if (count_polyA > count.max_length_polyA) {
                                count.max_length_polyA = count_polyA;
                            }
                        }
                    // end of loop last exon
                    }
                    progress.close();
                }
            }
            ++bam_index;
            if (inputs.isEmpty() || bam_index >= inputs.size())
                break;
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        final VCFInfoHeaderLine infoGeneId = new VCFInfoHeaderLine("GENE", 1, VCFHeaderLineType.String, "Gene ID in " + this.gffPath);
        metaData.add(infoGeneId);
        final VCFInfoHeaderLine infoTranscriptId = new VCFInfoHeaderLine("TRANSCRIPT", 1, VCFHeaderLineType.String, "Transcript ID in " + this.gffPath);
        metaData.add(infoTranscriptId);
        final VCFInfoHeaderLine infoStrand = new VCFInfoHeaderLine("STRAND", 1, VCFHeaderLineType.String, "Strand");
        metaData.add(infoStrand);
        final VCFInfoHeaderLine infoTranscriptMaxPolyA = new VCFInfoHeaderLine("TRANSCRIPT_MAX", 1, VCFHeaderLineType.Integer, "Max poly A in Transcript");
        metaData.add(infoTranscriptMaxPolyA);
        final VCFInfoHeaderLine infoGeneMaxPolyA = new VCFInfoHeaderLine("GENE_MAX", 1, VCFHeaderLineType.Integer, "Max poly A in Gene");
        metaData.add(infoGeneMaxPolyA);
        final VCFInfoHeaderLine infoEndPos = new VCFInfoHeaderLine("POS3", 1, VCFHeaderLineType.Integer, "End 3 prime position");
        metaData.add(infoEndPos);
        final VCFInfoHeaderLine infoGeneName = new VCFInfoHeaderLine("GENE_NAME", 1, VCFHeaderLineType.String, "Gene Name");
        metaData.add(infoGeneName);
        final VCFInfoHeaderLine infoBiotype = new VCFInfoHeaderLine("GENE_BIOTYPE", 1, VCFHeaderLineType.String, "Gene Biotype");
        metaData.add(infoBiotype);
        final int n_last_exon_bases = Math.max(0, Integer.parseInt(dynaParams.getOrDefault("last.n.exons", "10")));
        final VCFInfoHeaderLine infoLastExonBases = new VCFInfoHeaderLine("LAST_BASES", 1, VCFHeaderLineType.String, "Last exon bases N=" + n_last_exon_bases + ". Reverse-complemented for negative strand.");
        metaData.add(infoLastExonBases);
        final int n_after_exon_bases = Math.max(0, Integer.parseInt(dynaParams.getOrDefault("after.n.exons", "10")));
        final VCFInfoHeaderLine infoAfterExonBases = new VCFInfoHeaderLine("AFTER_BASES", 1, VCFHeaderLineType.String, "Bases after exon N=" + n_after_exon_bases + ". Reverse-complemented for negative strand.");
        metaData.add(infoAfterExonBases);
        final VCFInfoHeaderLine infoOtherIdss = new VCFInfoHeaderLine("OTHER_IDS", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Other transcripts ending at the same coordinate.");
        metaData.add(infoOtherIdss);
        final VCFFormatHeaderLine fmtMaxPolyA = new VCFFormatHeaderLine("MAX", 1, VCFHeaderLineType.Integer, "Max poly A");
        metaData.add(fmtMaxPolyA);
        final VCFFormatHeaderLine fmtReadPolyA = new VCFFormatHeaderLine("DPA", 1, VCFHeaderLineType.Integer, "Read with at least one A");
        metaData.add(fmtReadPolyA);
        final VCFFormatHeaderLine fmtAveragePolyA = new VCFFormatHeaderLine("AVG", 1, VCFHeaderLineType.Float, "average length of poly-A for reads carrying at least one A.");
        metaData.add(fmtAveragePolyA);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY);
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY);
        final VCFHeader header = new VCFHeader(metaData, samples.stream().sorted().collect(Collectors.toList()));
        header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        final UnaryOperator<String> afterColon = S -> {
            if (!(S.startsWith("gene:") || S.startsWith("transcript:")))
                return S;
            int colon = S.indexOf(":");
            return S.substring(colon + 1);
        };
        final List<Allele> ALLELES = Collections.singletonList(Allele.create("N", true));
        try (VariantContextWriter w = writingVariantsDelegate.dictionary(dict).open(this.outputFile);
            ReferenceSequenceFile fai = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx)) {
            w.writeHeader(header);
            exonMap.values().stream().sorted(new ContigDictComparator(dict).createLocatableComparator()).forEach(T -> {
                if (T.getDP() == 0)
                    return;
                final String lastBases;
                final String afterBases;
                if (T.isPlusStrand()) {
                    lastBases = fai.getSubsequenceAt(T.getContig(), Math.max(T.getStart(), T.getEnd() - n_last_exon_bases), T.getEnd()).getBaseString();
                    final SAMSequenceRecord ssr = Objects.requireNonNull(dict.getSequence(T.getContig()));
                    afterBases = fai.getSubsequenceAt(T.getContig(), T.getEnd() + 1, Math.min(T.getEnd() + n_after_exon_bases, ssr.getLengthOnReference())).getBaseString();
                } else if (T.isMinusStrand()) {
                    lastBases = AcidNucleics.reverseComplement(fai.getSubsequenceAt(T.getContig(), T.getStart(), Math.min(T.getEnd(), T.getStart() + n_last_exon_bases)).getBaseString());
                    afterBases = AcidNucleics.reverseComplement(fai.getSubsequenceAt(T.getContig(), Math.max(1, T.getStart() - n_after_exon_bases), T.getStart() - 1).getBaseString());
                } else {
                    lastBases = null;
                    afterBases = null;
                }
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(T.getContig());
                vcb.start(T.getStart());
                vcb.stop(T.getEnd());
                vcb.id(afterColon.apply(T.transcriptId));
                vcb.attribute(VCFConstants.END_KEY, T.getEnd());
                vcb.attribute(infoGeneId.getID(), afterColon.apply(T.gene.geneId));
                vcb.attribute(infoTranscriptId.getID(), afterColon.apply(T.transcriptId));
                vcb.attribute(infoStrand.getID(), T.strand.name());
                vcb.attribute(infoEndPos.getID(), T.getPosition());
                if (T.otherIds != null && !T.otherIds.isEmpty()) {
                    vcb.attribute(infoOtherIdss.getID(), T.otherIds.stream().map(afterColon).collect(Collectors.toList()));
                }
                if (!StringUtils.isBlank(lastBases)) {
                    vcb.attribute(infoLastExonBases.getID(), lastBases);
                }
                if (!StringUtils.isBlank(afterBases)) {
                    vcb.attribute(infoAfterExonBases.getID(), afterBases);
                }
                if (!StringUtils.isBlank(T.gene.geneName)) {
                    vcb.attribute(infoGeneName.getID(), T.gene.geneName);
                }
                if (!StringUtils.isBlank(T.gene.biotype)) {
                    vcb.attribute(infoBiotype.getID(), T.gene.biotype);
                }
                final List<Genotype> genotypes = new ArrayList<>(samples.size());
                for (String sn : samples) {
                    final ExonCount count = T.sample2count.get(sn);
                    final GenotypeBuilder gb = new GenotypeBuilder(sn);
                    gb.attribute(fmtMaxPolyA.getID(), count == null ? 0 : count.max_length_polyA);
                    gb.attribute(fmtReadPolyA.getID(), count == null ? 0 : count.n_tested_reads_with_A);
                    gb.attribute(fmtAveragePolyA.getID(), count == null || count.n_tested_reads_with_A == 0 ? 0f : count.sum_polyA / (float) count.n_tested_reads_with_A);
                    gb.DP(count == null ? 0 : count.n_tested_reads);
                    genotypes.add(gb.make());
                }
                vcb.alleles(ALLELES);
                vcb.genotypes(genotypes);
                vcb.attribute(VCFConstants.DEPTH_KEY, T.getDP());
                final int score = T.getMaxPolyA();
                if (score > 0)
                    vcb.log10PError(score / -10.0);
                vcb.attribute(infoTranscriptMaxPolyA.getID(), score);
                vcb.attribute(infoGeneMaxPolyA.getID(), T.gene.getmaxPolyA());
                w.add(vcb.make());
            });
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) AsciiLineReader(htsjdk.tribble.readers.AsciiLineReader) Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) LineIterator(htsjdk.tribble.readers.LineIterator) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) BiFunction(java.util.function.BiFunction) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) UnaryOperator(java.util.function.UnaryOperator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) Map(java.util.Map) Path(java.nio.file.Path) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Collection(java.util.Collection) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) LineIteratorImpl(htsjdk.tribble.readers.LineIteratorImpl) Objects(java.util.Objects) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) HashMap(java.util.HashMap) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) Strand(htsjdk.tribble.annotation.Strand) SAMRecordDefaultFilter(com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Gff3Feature(htsjdk.tribble.gff.Gff3Feature) Locatable(htsjdk.samtools.util.Locatable) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ToIntFunction(java.util.function.ToIntFunction) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) SamInputResource(htsjdk.samtools.SamInputResource) Gff3Codec(htsjdk.tribble.gff.Gff3Codec) QueryInterval(htsjdk.samtools.QueryInterval) DynamicParameter(com.beust.jcommander.DynamicParameter) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) InputStream(java.io.InputStream) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) LineIteratorImpl(htsjdk.tribble.readers.LineIteratorImpl) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) AsciiLineReader(htsjdk.tribble.readers.AsciiLineReader) Genotype(htsjdk.variant.variantcontext.Genotype) CigarOperator(htsjdk.samtools.CigarOperator) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarElement(htsjdk.samtools.CigarElement) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval) QueryInterval(htsjdk.samtools.QueryInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) LineIterator(htsjdk.tribble.readers.LineIterator) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) SamReader(htsjdk.samtools.SamReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Path(java.nio.file.Path) SamReaderFactory(htsjdk.samtools.SamReaderFactory) InputStream(java.io.InputStream) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) Gff3Codec(htsjdk.tribble.gff.Gff3Codec) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap)

Example 69 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class CnvSlidingWindow method doWork.

@Override
public int doWork(final List<String> args) {
    final List<Sample> samples = new ArrayList<>();
    try {
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
        final DistanceParser distParser = new DistanceParser();
        final int[] windows_array = Arrays.stream(CharSplitter.SEMICOLON.split(windowDefs)).filter(S -> !StringUtils.isBlank(S)).mapToInt(N -> distParser.applyAsInt(N)).toArray();
        if (windows_array.length == 0) {
            LOG.error("No window defined.");
            return -1;
        }
        if (windows_array.length % 2 != 0) {
            LOG.error("odd number of windows ? " + this.windowDefs);
            return -1;
        }
        final List<Path> inputBams = IOUtils.unrollPaths(args);
        if (inputBams.isEmpty()) {
            LOG.error("input bam file missing.");
            return -1;
        }
        final Set<String> sampleNames = new TreeSet<>();
        for (final Path samFile : inputBams) {
            final Sample sample = new Sample(samFile);
            if (sampleNames.contains(sample.name)) {
                LOG.error("duplicate sample " + sample.name);
                sample.close();
                return -1;
            }
            sampleNames.add(sample.name);
            samples.add(sample);
            SequenceUtil.assertSequenceDictionariesEqual(dict, sample.dict);
        }
        final List<Locatable> contigs = dict.getSequences().stream().filter(SR -> !SR.getSequenceName().matches(this.excludeRegex)).collect(Collectors.toCollection(ArrayList::new));
        if (this.excludeBedPath != null) {
            final BedLineCodec bedLineCodec = new BedLineCodec();
            final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(dict);
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
                final List<SimpleInterval> exclude = br.lines().filter(L -> !BedLine.isBedHeader(L)).map(L -> bedLineCodec.decode(L)).filter(B -> B != null && !StringUtils.isBlank(ctgConverter.apply(B.getContig()))).map(B -> new SimpleInterval(ctgConverter.apply(B.getContig()), B.getStart(), B.getEnd())).collect(Collectors.toList());
                boolean done = false;
                while (!done) {
                    done = true;
                    int i = 0;
                    while (i < contigs.size()) {
                        final Locatable contig = contigs.get(i);
                        final Locatable overlapper = exclude.stream().filter(EX -> EX.overlaps(contig)).findAny().orElse(null);
                        if (overlapper != null) {
                            contigs.remove(i);
                            contigs.addAll(split(contig, overlapper));
                            done = false;
                        } else {
                            i++;
                        }
                    }
                }
            }
        }
        contigs.sort(new ContigDictComparator(dict).createLocatableComparator());
        final Allele ref_allele = Allele.create("N", true);
        final Allele dup_allele = Allele.create("<DUP>", false);
        final Allele del_allele = Allele.create("<DEL>", false);
        final Function<Integer, List<Allele>> cnv2allele = CNV -> {
            switch(CNV) {
                case 0:
                    return Arrays.asList(ref_allele, ref_allele);
                case 1:
                    return Arrays.asList(ref_allele, dup_allele);
                case 2:
                    return Arrays.asList(dup_allele, dup_allele);
                case -1:
                    return Arrays.asList(ref_allele, del_allele);
                case -2:
                    return Arrays.asList(del_allele, del_allele);
                default:
                    throw new IllegalArgumentException("cnv:" + CNV);
            }
        };
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
        final VCFFormatHeaderLine fmtLeftCov = new VCFFormatHeaderLine("LC", 1, VCFHeaderLineType.Float, "Left median coverage.");
        final VCFFormatHeaderLine fmtMidCov = new VCFFormatHeaderLine("MC", 1, VCFHeaderLineType.Float, "Middle median coverage.");
        final VCFFormatHeaderLine fmtRightCov = new VCFFormatHeaderLine("RC", 1, VCFHeaderLineType.Float, "right median coverage.");
        final VCFFormatHeaderLine fmtLeftMedian = new VCFFormatHeaderLine("LM", 1, VCFHeaderLineType.Float, "Left normalized median coverage.");
        final VCFFormatHeaderLine fmtMidMedian = new VCFFormatHeaderLine("MM", 1, VCFHeaderLineType.Float, "Middle normalized median coverage.");
        final VCFFormatHeaderLine fmtRightMedian = new VCFFormatHeaderLine("RM", 1, VCFHeaderLineType.Float, "right normalized median coverage.");
        metaData.add(fmtLeftCov);
        metaData.add(fmtMidCov);
        metaData.add(fmtRightCov);
        metaData.add(fmtLeftMedian);
        metaData.add(fmtMidMedian);
        metaData.add(fmtRightMedian);
        final VCFHeader header = new VCFHeader(metaData, sampleNames);
        header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        final VariantContextWriter vcw = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
        vcw.writeHeader(header);
        for (final Locatable contig : contigs) {
            System.gc();
            final short[] array = new short[contig.getLengthOnReference()];
            final SortingCollection<Gt> sorter = SortingCollection.newInstance(Gt.class, new GtCodec(), (A, B) -> A.compare1(B), this.sorting.getMaxRecordsInRam(), this.sorting.getTmpPaths());
            for (int bam_index = 0; bam_index < samples.size(); bam_index++) {
                final Sample sampleBam = samples.get(bam_index);
                Arrays.fill(array, (short) 0);
                try (SAMRecordIterator iter = sampleBam.samReader.queryOverlapping(contig.getContig(), contig.getStart(), contig.getEnd())) {
                    while (iter.hasNext()) {
                        final SAMRecord rec = iter.next();
                        if (rec.getReadUnmappedFlag())
                            continue;
                        if (rec.getDuplicateReadFlag())
                            continue;
                        if (rec.isSecondaryOrSupplementary())
                            continue;
                        if (rec.getReadFailsVendorQualityCheckFlag())
                            continue;
                        final Cigar cigar = rec.getCigar();
                        if (cigar == null || cigar.isEmpty())
                            continue;
                        int refPos = rec.getStart();
                        for (final CigarElement ce : cigar) {
                            final CigarOperator op = ce.getOperator();
                            if (op.consumesReferenceBases()) {
                                if (op.consumesReadBases()) {
                                    for (int i = 0; i < ce.getLength(); i++) {
                                        final int idx = refPos - contig.getStart() + i;
                                        if (idx < 0)
                                            continue;
                                        if (idx >= array.length)
                                            break;
                                        if (array[idx] == Short.MAX_VALUE)
                                            continue;
                                        array[idx]++;
                                    }
                                }
                                refPos += ce.getLength();
                            }
                        }
                    }
                }
                for (int widx = 0; widx < windows_array.length; widx += 2) {
                    final int window_size = windows_array[widx + 0];
                    final int extend = (int) Math.ceil(window_size * this.extend);
                    if (extend <= 0)
                        continue;
                    if (window_size > contig.getLengthOnReference())
                        continue;
                    final int window_shift = windows_array[widx + 1];
                    LOG.info(contig + " " + window_size + "+-" + extend + ";" + window_shift + " " + sampleBam.name);
                    final Coverage leftcov = new Coverage(extend);
                    final Coverage rightcov = new Coverage(extend);
                    final Coverage leftrightcov = new Coverage(extend + extend);
                    final Coverage midcov = new Coverage(window_size);
                    for (int pos1 = contig.getStart(); pos1 + window_size + extend + extend < contig.getEnd(); pos1 += window_shift) {
                        leftcov.reset();
                        rightcov.reset();
                        leftrightcov.reset();
                        midcov.reset();
                        for (int x = 0; x < extend; x++) {
                            final int idx = pos1 - contig.getStart() + x;
                            leftcov.add(array[idx]);
                            leftrightcov.add(array[idx]);
                        }
                        final double leftMedian = leftcov.median();
                        if (leftMedian < this.min_depth)
                            continue;
                        for (int x = 0; x < extend; x++) {
                            final int idx = pos1 - contig.getStart() + extend + window_size + x;
                            rightcov.add(array[idx]);
                            leftrightcov.add(array[idx]);
                        }
                        final double rightMedian = rightcov.median();
                        if (rightMedian < this.min_depth)
                            continue;
                        for (int x = 0; x < window_size; x++) {
                            final int idx = pos1 - contig.getStart() + extend + x;
                            midcov.add(array[idx]);
                        }
                        final double median = leftrightcov.median();
                        if (rightcov.median() < this.min_depth)
                            continue;
                        for (int x = 0; x < midcov.count; x++) {
                            midcov.array[x] /= median;
                        }
                        for (int x = 0; x < leftcov.count; x++) {
                            leftcov.array[x] /= median;
                        }
                        for (int x = 0; x < rightcov.count; x++) {
                            rightcov.array[x] /= median;
                        }
                        final double norm_depth = midcov.median();
                        final int cnv = getCNVIndex(norm_depth);
                        if (cnv != CNV_UNDEFINED && cnv != 1 && getCNVIndex(leftcov.median()) == 1 && getCNVIndex(rightcov.median()) == 1) {
                            final Gt gt = new Gt();
                            gt.start = pos1 + extend;
                            gt.end = gt.start + window_size;
                            gt.sample_idx = bam_index;
                            gt.cnv = cnv;
                            gt.leftMedian = (float) leftMedian;
                            gt.midMedian = (float) median;
                            gt.rightMedian = (float) rightMedian;
                            gt.leftCov = (float) leftcov.median();
                            gt.midCov = (float) norm_depth;
                            gt.rightCov = (float) rightcov.median();
                            sorter.add(gt);
                        }
                    }
                }
            }
            sorter.setDestructiveIteration(true);
            try (CloseableIterator<Gt> iter = sorter.iterator()) {
                final EqualRangeIterator<Gt> eq = new EqualRangeIterator<>(iter, (A, B) -> A.compare0(B));
                while (eq.hasNext()) {
                    final List<Gt> row = eq.next();
                    if (row.isEmpty())
                        continue;
                    final Gt first = row.get(0);
                    final Set<Allele> alleles = new HashSet<>();
                    row.stream().flatMap(GT -> cnv2allele.apply(GT.cnv).stream()).forEach(CNV -> alleles.add(CNV));
                    alleles.add(ref_allele);
                    if (alleles.size() < 2)
                        continue;
                    final VariantContextBuilder vcb = new VariantContextBuilder(null, contig.getContig(), first.start, first.end, alleles);
                    vcb.attribute(VCFConstants.END_KEY, first.end);
                    final List<Genotype> genotypes = new ArrayList<>(samples.size());
                    for (final Gt gt : row) {
                        final GenotypeBuilder gb = new GenotypeBuilder(samples.get(gt.sample_idx).name, cnv2allele.apply(gt.cnv));
                        gb.attribute(fmtLeftMedian.getID(), (double) gt.leftMedian);
                        gb.attribute(fmtMidMedian.getID(), (double) gt.midMedian);
                        gb.attribute(fmtRightMedian.getID(), (double) gt.rightMedian);
                        gb.attribute(fmtLeftCov.getID(), (double) gt.leftCov);
                        gb.attribute(fmtMidCov.getID(), (double) gt.midCov);
                        gb.attribute(fmtRightCov.getID(), (double) gt.rightCov);
                        genotypes.add(gb.make());
                    }
                    vcb.genotypes(genotypes);
                    vcw.add(vcb.make());
                }
                eq.close();
            }
            sorter.cleanup();
        }
        vcw.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        samples.forEach(S -> CloserUtil.close(S));
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) DataOutputStream(java.io.DataOutputStream) StringUtil(htsjdk.samtools.util.StringUtil) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) EOFException(java.io.EOFException) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DataInputStream(java.io.DataInputStream) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) Function(java.util.function.Function) ValidationStringency(htsjdk.samtools.ValidationStringency) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) Closeable(java.io.Closeable) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) BufferedReader(java.io.BufferedReader) Collections(java.util.Collections) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) ArrayList(java.util.ArrayList) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) TreeSet(java.util.TreeSet) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) List(java.util.List) ArrayList(java.util.ArrayList) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) Genotype(htsjdk.variant.variantcontext.Genotype) CigarOperator(htsjdk.samtools.CigarOperator) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarElement(htsjdk.samtools.CigarElement) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SAMRecord(htsjdk.samtools.SAMRecord) Locatable(htsjdk.samtools.util.Locatable) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Path(java.nio.file.Path) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader)

Example 70 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class CoverageMatrix method doWork.

@Override
public int doWork(final List<String> args) {
    VariantContextWriter w = null;
    try {
        final IndexCovUtils indexCovUtils = new IndexCovUtils(this.indexCovTreshold);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
        final SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault().referenceSequence(CoverageMatrix.this.refPath).validationStringency(ValidationStringency.LENIENT);
        final List<Path> inputBams = IOUtils.unrollPaths(args);
        if (inputBams.size() < 3) {
            LOG.error("not enough input bam file defined.");
            return -1;
        }
        final Set<String> sampleSet = new TreeSet<>();
        final List<String> idx2samples = new ArrayList<String>(inputBams.size());
        for (final Path path : inputBams) {
            try (SamReader sr = samReaderFactory.open(path)) {
                final SAMFileHeader header = sr.getFileHeader();
                final String sample = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(path));
                if (sampleSet.contains(sample)) {
                    LOG.error("duplicate sample " + sample);
                    return -1;
                }
                sampleSet.add(sample);
                idx2samples.add(sample);
            }
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        w = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
        final VCFFormatHeaderLine fmtNormDepth = new VCFFormatHeaderLine("D", 1, VCFHeaderLineType.Float, "norm Depth");
        metaData.add(fmtNormDepth);
        final VCFFormatHeaderLine fmtStdDev = new VCFFormatHeaderLine("STDDEV", 1, VCFHeaderLineType.Float, "standard deviation");
        metaData.add(fmtStdDev);
        final VCFInfoHeaderLine infoStdDev = new VCFInfoHeaderLine(fmtStdDev.getID(), 1, VCFHeaderLineType.Float, "standard deviation");
        metaData.add(infoStdDev);
        final VCFInfoHeaderLine infoMedianD = new VCFInfoHeaderLine("MEDIAN", 1, VCFHeaderLineType.Float, "median depth");
        metaData.add(infoMedianD);
        final VCFInfoHeaderLine infoNSamples = new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "number of samples");
        metaData.add(infoNSamples);
        final VCFInfoHeaderLine infoSamples = new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples");
        metaData.add(infoSamples);
        final VCFFilterHeaderLine filterAll = new VCFFilterHeaderLine("ALL_AFFECTED", "All Samples carry a variant");
        metaData.add(filterAll);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
        final VCFHeader vcfheader = new VCFHeader(metaData, idx2samples);
        vcfheader.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, vcfheader);
        w.writeHeader(vcfheader);
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            if (!StringUtils.isBlank(restrictContig) && !restrictContig.equals(ssr.getSequenceName()))
                continue;
            final int[] depth = new int[ssr.getSequenceLength()];
            final BitSet blackListedPositions = new BitSet(depth.length);
            // fill black listed regions
            if (this.blackListedPath != null) {
                try (TabixReader tbr = new TabixReader(this.blackListedPath.toString())) {
                    final ContigNameConverter cvt = ContigNameConverter.fromContigSet(tbr.getChromosomes());
                    final String ctg = cvt.apply(ssr.getSequenceName());
                    if (!StringUtils.isBlank(ctg)) {
                        final BedLineCodec codec = new BedLineCodec();
                        final TabixReader.Iterator tbxr = tbr.query(ctg, 1, ssr.getSequenceLength());
                        for (; ; ) {
                            final String line = tbxr.next();
                            if (line == null)
                                break;
                            final BedLine bed = codec.decode(line);
                            if (bed == null)
                                continue;
                            int p1 = Math.max(bed.getStart(), 1);
                            while (p1 <= ssr.getSequenceLength() && p1 <= bed.getEnd()) {
                                blackListedPositions.set(p1 - 1);
                                ++p1;
                            }
                        }
                    }
                } catch (Throwable err) {
                    LOG.warn(err);
                }
            }
            final SortingCollection<CovItem> sorter = SortingCollection.newInstance(CovItem.class, new CovItemCodec(), (A, B) -> A.compare0(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
            for (int bam_idx = 0; bam_idx < inputBams.size(); ++bam_idx) {
                final Path path = inputBams.get(bam_idx);
                LOG.info(ssr.getContig() + ":" + path + " " + bam_idx + "/" + inputBams.size());
                try (SamReader sr = samReaderFactory.open(path)) {
                    final SAMFileHeader header = sr.getFileHeader();
                    SequenceUtil.assertSequenceDictionariesEqual(dict, header.getSequenceDictionary());
                    Arrays.fill(depth, 0);
                    try (CloseableIterator<SAMRecord> siter = sr.queryOverlapping(ssr.getContig(), 1, ssr.getLengthOnReference())) {
                        while (siter.hasNext()) {
                            final SAMRecord rec = siter.next();
                            if (rec.getReadUnmappedFlag())
                                continue;
                            if (!SAMRecordDefaultFilter.accept(rec, this.min_mapq))
                                continue;
                            int ref = rec.getStart();
                            final Cigar cigar = rec.getCigar();
                            if (cigar == null)
                                continue;
                            for (CigarElement ce : cigar) {
                                final CigarOperator op = ce.getOperator();
                                final int len = ce.getLength();
                                if (op.consumesReferenceBases()) {
                                    if (op.consumesReadBases()) {
                                        for (int i = 0; i < len; i++) {
                                            final int pos = ref + i;
                                            if (pos < 1)
                                                continue;
                                            if (pos > ssr.getLengthOnReference())
                                                break;
                                            depth[pos - 1]++;
                                        }
                                    }
                                    ref += len;
                                }
                            }
                        // loop cigar
                        }
                    // end samItere
                    }
                    // try
                    final DiscreteMedian<Integer> discreteMedian = new DiscreteMedian<>();
                    int pos = 0;
                    while (pos < depth.length) {
                        if (!blackListedPositions.get(pos) && depth[pos] <= this.max_depth) {
                            discreteMedian.add(depth[pos]);
                        }
                        ++pos;
                    }
                    final double median = discreteMedian.getMedian().orElse(1.0);
                    LOG.info(idx2samples.get(bam_idx) + " :" + ssr.getSequenceName() + " median depth:" + median);
                    final DiscreteMedian<Integer> localMedian = new DiscreteMedian<>();
                    pos = 0;
                    while (pos < depth.length) {
                        if (blackListedPositions.get(pos)) /* non pas maxdepth */
                        {
                            ++pos;
                            continue;
                        }
                        int pos2 = pos;
                        localMedian.clear();
                        while (pos2 - pos < this.bin_size && pos2 < depth.length && !blackListedPositions.get(pos2)) {
                            // consider this.max_depth here ?
                            localMedian.add(depth[pos2]);
                            ++pos2;
                        }
                        if (pos2 - pos == this.bin_size) {
                            final double localMed = localMedian.getMedian().orElse(0.0);
                            final CovItem item = new CovItem();
                            item.pos = pos;
                            item.sample_idx = bam_idx;
                            item.depth = (float) (localMed / median);
                            item.stddev = (float) localMedian.getStandardDeviation().orElse(-1.0);
                            sorter.add(item);
                        }
                        pos = pos2;
                    }
                }
            // end loop over samples
            }
            // end loop over bams
            sorter.doneAdding();
            sorter.setDestructiveIteration(true);
            final CloseableIterator<CovItem> iter = sorter.iterator();
            final EqualRangeIterator<CovItem> iter2 = new EqualRangeIterator<>(iter, (A, B) -> Integer.compare(A.pos, B.pos));
            final Allele REF = Allele.create("N", true);
            final Allele DEL = Allele.create("<DEL>", false);
            final Allele DUP = Allele.create("<DUP>", false);
            while (iter2.hasNext()) {
                final List<CovItem> list = iter2.next();
                final CovItem first = list.get(0);
                final double avg_depth = list.stream().mapToDouble(F -> F.depth).average().orElse(0);
                final double sum = list.stream().mapToDouble(F -> F.depth).map(D -> Math.pow(D - avg_depth, 2.0)).sum();
                final double stdDev = Math.sqrt(sum / list.size());
                final OptionalDouble optMedianOfmedian = Percentile.median().evaluate(list.stream().mapToDouble(I -> I.depth));
                final double medianOfmedian = optMedianOfmedian.orElse(1.0);
                if (medianOfmedian <= 0)
                    continue;
                for (int i = 0; i < list.size(); i++) {
                    list.get(i).depth /= medianOfmedian;
                }
                if (list.stream().allMatch(F -> Float.isNaN(F.depth) || Float.isInfinite(F.depth)))
                    continue;
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(ssr.getContig());
                vcb.start(first.pos + 1);
                vcb.stop(first.pos + this.bin_size);
                vcb.attribute(VCFConstants.END_KEY, first.pos + this.bin_size);
                vcb.attribute(infoStdDev.getID(), stdDev);
                vcb.attribute(infoMedianD.getID(), medianOfmedian);
                final Set<Allele> alleles = new HashSet<>();
                alleles.add(REF);
                final List<Genotype> genotypes = new ArrayList<>(list.size());
                final Set<String> affected = new TreeSet<>();
                for (int i = 0; i < list.size(); i++) {
                    final CovItem item = list.get(i);
                    final String sn = idx2samples.get(item.sample_idx);
                    final GenotypeBuilder gb;
                    switch(indexCovUtils.getType(item.depth)) {
                        case AMBIGOUS:
                            gb = new GenotypeBuilder(sn, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
                            break;
                        case HET_DEL:
                            alleles.add(DEL);
                            gb = new GenotypeBuilder(sn, Arrays.asList(REF, DEL));
                            affected.add(sn);
                            break;
                        case HOM_DEL:
                            alleles.add(DEL);
                            gb = new GenotypeBuilder(sn, Arrays.asList(DEL, DEL));
                            affected.add(sn);
                            break;
                        case HET_DUP:
                            alleles.add(DUP);
                            gb = new GenotypeBuilder(sn, Arrays.asList(REF, DUP));
                            affected.add(sn);
                            break;
                        case HOM_DUP:
                            alleles.add(DUP);
                            gb = new GenotypeBuilder(sn, Arrays.asList(DUP, DUP));
                            affected.add(sn);
                            break;
                        case REF:
                            gb = new GenotypeBuilder(sn, Arrays.asList(REF, REF));
                            break;
                        default:
                            throw new IllegalStateException();
                    }
                    gb.attribute(fmtNormDepth.getID(), item.depth);
                    gb.attribute(fmtStdDev.getID(), item.stddev);
                    genotypes.add(gb.make());
                }
                if (affected.isEmpty())
                    continue;
                if (affected.size() == inputBams.size()) {
                    vcb.filter(filterAll.getID());
                } else {
                    vcb.passFilters();
                }
                vcb.attribute(infoSamples.getID(), new ArrayList<>(affected));
                vcb.attribute(infoNSamples.getID(), affected.size());
                vcb.genotypes(genotypes);
                vcb.alleles(alleles);
                w.add(vcb.make());
            }
            iter2.close();
            iter.close();
            sorter.cleanup();
            System.gc();
        }
        // end while iter
        w.close();
        w = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(w);
    }
}
Also used : IndexCovUtils(com.github.lindenb.jvarkit.tools.structvar.indexcov.IndexCovUtils) WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) DataOutputStream(java.io.DataOutputStream) StringUtil(htsjdk.samtools.util.StringUtil) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) DiscreteMedian(com.github.lindenb.jvarkit.math.DiscreteMedian) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) EOFException(java.io.EOFException) Percentile(com.github.lindenb.jvarkit.math.stats.Percentile) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DataInputStream(java.io.DataInputStream) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) OptionalDouble(java.util.OptionalDouble) ValidationStringency(htsjdk.samtools.ValidationStringency) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) SAMRecordDefaultFilter(com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) TabixReader(htsjdk.tribble.readers.TabixReader) VCFConstants(htsjdk.variant.vcf.VCFConstants) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) BitSet(java.util.BitSet) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) IndexCovUtils(com.github.lindenb.jvarkit.tools.structvar.indexcov.IndexCovUtils) TreeSet(java.util.TreeSet) DiscreteMedian(com.github.lindenb.jvarkit.math.DiscreteMedian) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) BitSet(java.util.BitSet) Genotype(htsjdk.variant.variantcontext.Genotype) CigarOperator(htsjdk.samtools.CigarOperator) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarElement(htsjdk.samtools.CigarElement) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) TabixReader(htsjdk.tribble.readers.TabixReader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Path(java.nio.file.Path) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) OptionalDouble(java.util.OptionalDouble) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Aggregations

CloseableIterator (htsjdk.samtools.util.CloseableIterator)103 List (java.util.List)86 Logger (com.github.lindenb.jvarkit.util.log.Logger)85 Parameter (com.beust.jcommander.Parameter)82 Program (com.github.lindenb.jvarkit.util.jcommander.Program)78 ArrayList (java.util.ArrayList)73 Collectors (java.util.stream.Collectors)71 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)69 Path (java.nio.file.Path)69 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)66 CloserUtil (htsjdk.samtools.util.CloserUtil)64 Set (java.util.Set)64 VCFHeader (htsjdk.variant.vcf.VCFHeader)59 VariantContext (htsjdk.variant.variantcontext.VariantContext)54 IOException (java.io.IOException)53 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)51 SAMRecord (htsjdk.samtools.SAMRecord)51 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)51 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)50 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)49