Search in sources :

Example 71 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class PcrSliceReads method run.

@SuppressWarnings("resource")
private int run(SamReader reader) {
    ReadClipper readClipper = new ReadClipper();
    SAMFileHeader header1 = reader.getFileHeader();
    SAMFileHeader header2 = header1.clone();
    header2.addComment(getProgramName() + " " + getVersion() + ": Processed with " + getProgramCommandLine());
    header2.setSortOrder(SortOrder.unsorted);
    for (SAMReadGroupRecord srg : header1.getReadGroups()) {
        for (Interval i : this.bedIntervals.keySet()) {
            // create new read group for this interval
            SAMReadGroupRecord rg = new SAMReadGroupRecord(srg.getId() + "_" + this.bedIntervals.get(i).getName(), srg);
            header2.addReadGroup(rg);
        }
    }
    SAMFileWriter sw = null;
    SAMRecordIterator iter = null;
    try {
        sw = writingBamArgs.openSAMFileWriter(outputFile, header2, false);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
        iter = reader.iterator();
        while (iter.hasNext()) {
            SAMRecord rec = progress.watch(iter.next());
            if (rec.getReadUnmappedFlag()) {
                sw.addAlignment(rec);
                continue;
            }
            if (!rec.getReadPairedFlag()) {
                // @doc if the read is not a paired-end read ,  then the quality of the read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (rec.getMateUnmappedFlag()) {
                // @doc if the MATE is not mapped ,  then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (!rec.getProperPairFlag()) {
                // @doc if the properly-paired flag is not set,  then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (rec.getMateReferenceIndex() != rec.getReferenceIndex()) {
                // @doc if the read and the mate are not mapped on the same chromosome,  then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (rec.getReadNegativeStrandFlag() == rec.getMateNegativeStrandFlag()) {
                // @doc if the read and the mate are mapped on the same strand,  then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            int chromStart;
            int chromEnd;
            if (rec.getAlignmentStart() < rec.getMateAlignmentStart()) {
                if (rec.getReadNegativeStrandFlag()) {
                    // @doc if the read(POS) < mate(POS) and read is mapped on the negative strand, then the quality of the current read is set to zero
                    rec.setMappingQuality(0);
                    sw.addAlignment(rec);
                    continue;
                } else {
                    chromStart = rec.getAlignmentStart();
                    chromEnd = rec.getMateAlignmentStart();
                }
            } else {
                if (!rec.getReadNegativeStrandFlag()) {
                    // @doc if the read(POS) > mate(POS) and read is mapped on the positive strand, then the quality of the current read is set to zero
                    rec.setMappingQuality(0);
                    sw.addAlignment(rec);
                    continue;
                } else {
                    chromStart = rec.getMateAlignmentStart();
                    // don't use getAlignmentEnd, to be consistent with mate data
                    chromEnd = rec.getAlignmentStart();
                }
            }
            List<Interval> fragments = findIntervals(rec.getContig(), chromStart, chromEnd);
            if (fragments.isEmpty()) {
                // @doc if no BED fragment is found overlapping the read, then the quality of the read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            Interval best = null;
            if (fragments.size() == 1) {
                best = fragments.get(0);
            } else
                switch(this.ambiguityStrategy) {
                    case random:
                        {
                            best = fragments.get(this.random.nextInt(fragments.size()));
                            break;
                        }
                    case zero:
                        {
                            rec.setMappingQuality(0);
                            sw.addAlignment(rec);
                            continue;
                        }
                    case closer:
                        {
                            int best_distance = Integer.MAX_VALUE;
                            for (Interval frag : fragments) {
                                int distance = Math.abs(chromStart - frag.getStart()) + Math.abs(frag.getEnd() - chromEnd);
                                if (best == null || distance < best_distance) {
                                    best = frag;
                                    best_distance = distance;
                                }
                            }
                            break;
                        }
                    default:
                        throw new IllegalStateException();
                }
            if (clip_reads) {
                rec = readClipper.clip(rec, best);
                if (rec.getMappingQuality() == 0) {
                    sw.addAlignment(rec);
                    continue;
                }
            }
            SAMReadGroupRecord rg = rec.getReadGroup();
            if (rg == null) {
                throw new IOException("Read " + rec + " is missing a Read-Group ID . See http://broadinstitute.github.io/picard/ http://broadinstitute.github.io/picard/command-line-overview.html#AddOrReplaceReadGroups");
            }
            rec.setAttribute("RG", rg.getId() + "_" + best.getName());
            sw.addAlignment(rec);
        }
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sw);
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) IOException(java.io.IOException) IOException(java.io.IOException) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Interval(htsjdk.samtools.util.Interval)

Example 72 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class SoftClipAnnotator method map.

public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
    if (tracker == null)
        return 0;
    final Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
    if (VCs.isEmpty()) {
        return 0;
    }
    final Collection<ReadFilter> readFilters = super.getToolkit().getFilters();
    for (final VariantContext ctx : VCs) {
        int observed_genotypes = 0;
        int num_some_clipped_genotypes = 0;
        final List<Genotype> genotypes = new ArrayList<>(ctx.getNSamples());
        for (int i = 0; i < ctx.getNSamples(); ++i) {
            final Genotype g = ctx.getGenotype(i);
            if (!g.isCalled() || g.isNoCall() || g.isHomRef()) {
                genotypes.add(g);
                continue;
            }
            final List<SamReader> bamReaderForSample = this.sample2bam.get(g.getSampleName());
            if (bamReaderForSample.isEmpty()) {
                genotypes.add(g);
                continue;
            }
            observed_genotypes++;
            int numberOfClipsOverlapping = 0;
            for (final SamReader samReader : bamReaderForSample) {
                final SAMRecordIterator iter = samReader.query(ctx.getContig(), Math.max(0, ctx.getStart() - extend4clip), ctx.getEnd() + extend4clip, false);
                while (iter.hasNext()) {
                    final SAMRecord samRecord = iter.next();
                    if (samRecord.getReadUnmappedFlag() || samRecord.getCigar() == null || !samRecord.getContig().equals(ctx.getContig()) || samRecord.getUnclippedEnd() < ctx.getStart() || samRecord.getUnclippedStart() > ctx.getEnd() || samRecord.getReadGroup() == null || !g.getSampleName().equals(samRecord.getReadGroup().getSample()))
                        continue;
                    boolean ok = true;
                    for (final ReadFilter readFilter : readFilters) {
                        // this one cannot't be correctly initialized...
                        if (readFilter instanceof MalformedReadFilter)
                            continue;
                        if (readFilter.filterOut(samRecord)) {
                            ok = false;
                            break;
                        }
                    }
                    if (!ok)
                        continue;
                    int refPos = samRecord.getUnclippedStart();
                    for (final CigarElement ce : samRecord.getCigar()) {
                        if (ce.getOperator().consumesReferenceBases() || ce.getOperator().isClipping()) {
                            final int refEnd = refPos + ce.getLength();
                            if (!(ctx.getEnd() < refPos || refEnd < ctx.getStart())) {
                                // System.err.println("IN "+ce+" "+ctx.getStart()+"-"+ctx.getEnd()+" : "+refPos+"-"+refPos+ce.getLength());
                                if (ce.getOperator().equals(CigarOperator.S)) {
                                    ++numberOfClipsOverlapping;
                                }
                            }
                            refPos += ce.getLength();
                        }
                        if (refPos > ctx.getEnd())
                            break;
                    }
                }
                iter.close();
            }
            if (numberOfClipsOverlapping == 0) {
                genotypes.add(g);
            } else {
                GenotypeBuilder gb = new GenotypeBuilder(g);
                gb.attribute(numClipInGenotypeFormatHeaderLine.getID(), numberOfClipsOverlapping);
                genotypes.add(gb.make());
                num_some_clipped_genotypes++;
            }
        }
        if (num_some_clipped_genotypes == 0) {
            vcfWriter.add(ctx);
        } else {
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            vcb.genotypes(genotypes);
            if (observed_genotypes == num_some_clipped_genotypes) {
                vcb.filter(this.filterHaveClipInVariant.getID());
            }
            vcfWriter.add(vcb.make());
        }
    }
    return VCs.size();
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarElement(htsjdk.samtools.CigarElement) SamReader(htsjdk.samtools.SamReader) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) MalformedReadFilter(org.broadinstitute.gatk.engine.filters.MalformedReadFilter) SAMRecord(htsjdk.samtools.SAMRecord) MalformedReadFilter(org.broadinstitute.gatk.engine.filters.MalformedReadFilter) ReadFilter(org.broadinstitute.gatk.engine.filters.ReadFilter)

Example 73 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class LocalRealignReads method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidxFile == null) {
        LOG.error("REFerence file missing;");
        return -1;
    }
    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    SamReader samReader = null;
    SAMFileWriter w = null;
    SAMRecordIterator iter = null;
    GenomicSequence genomicSequence = null;
    LongestCommonSequence matrix = new LongestCommonSequence();
    try {
        indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
        samReader = openSamReader(oneFileOrNull(args));
        final SAMFileHeader header1 = samReader.getFileHeader();
        final SAMFileHeader header2 = header1.clone();
        header2.setSortOrder(SortOrder.unsorted);
        w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
        iter = samReader.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = progress.watch(iter.next());
            if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
                w.addAlignment(rec);
                continue;
            }
            final Cigar cigar = rec.getCigar();
            final int nCigarElement = cigar.numCigarElements();
            if (nCigarElement < 2) {
                w.addAlignment(rec);
                continue;
            }
            final CigarElement ce5 = cigar.getCigarElement(0);
            final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
            if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
                w.addAlignment(rec);
                continue;
            }
            if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
                genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
            }
            final CharSequence readseq = rec.getReadString();
            /**
             * short invert
             */
            if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
                }
            /*
					
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}
					*/
            }
            if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
                }
            /*
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}*/
            }
            /* other */
            List<LongestCommonSequence.Hit> hits = new ArrayList<>();
            align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
            if (hits.size() < 2000)
                continue;
            for (LongestCommonSequence.Hit hit : hits) {
                int readstart = 0;
                boolean in_M = false;
                for (int i = 0; i < nCigarElement; ++i) {
                    final CigarElement ce = cigar.getCigarElement(i);
                    if (ce.getOperator().consumesReadBases()) {
                        int readend = readstart + ce.getLength();
                        if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
                            if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
                                in_M = true;
                                break;
                            }
                        }
                        readstart = readend;
                    }
                }
                if (in_M)
                    continue;
                int align_size = hit.size();
                System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
                System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
                System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
                System.err.print("REF  :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(genomicSequence.charAt(hit.getStartX() + i));
                }
                System.err.println();
                System.err.print("READ :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(readseq.charAt(hit.getStartY() + i));
                }
                System.err.println();
            }
            System.err.println();
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception e) {
        return wrapException(e);
    } finally {
        genomicSequence = null;
        CloserUtil.close(iter);
        CloserUtil.close(samReader);
        CloserUtil.close(w);
        CloserUtil.close(indexedFastaSequenceFile);
        matrix = null;
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) ArrayList(java.util.ArrayList) LongestCommonSequence(com.github.lindenb.jvarkit.util.align.LongestCommonSequence) CigarElement(htsjdk.samtools.CigarElement) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) Cigar(htsjdk.samtools.Cigar) SubSequence(com.github.lindenb.jvarkit.lang.SubSequence) SAMRecord(htsjdk.samtools.SAMRecord) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Example 74 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class ConvertBamChromosomes method doWork.

@Override
public int doWork(List<String> args) {
    SamReader sfr = null;
    SAMFileWriter sfw = null;
    final Set<String> unmappedChromosomes = new HashSet<>();
    try {
        final ContigNameConverter customMapping = ContigNameConverter.fromFile(mappingFile);
        customMapping.setOnNotFound(this.onNotFound);
        sfr = super.openSamReader(oneFileOrNull(args));
        SAMFileHeader header1 = sfr.getFileHeader();
        if (header1 == null) {
            LOG.error("File header missing");
            return -1;
        }
        SAMFileHeader header2 = header1.clone();
        // create new sequence dict
        final SAMSequenceDictionary dict1 = header1.getSequenceDictionary();
        if (dict1 == null) {
            LOG.error("Sequence dict missing");
            return -1;
        }
        final List<SAMSequenceRecord> ssrs = new ArrayList<SAMSequenceRecord>(dict1.size());
        for (int i = 0; i < dict1.size(); ++i) {
            SAMSequenceRecord ssr = dict1.getSequence(i);
            String newName = customMapping.apply(ssr.getSequenceName());
            if (newName == null) {
                // skip unknown chromosomes
                continue;
            }
            ssr = new SAMSequenceRecord(newName, ssr.getSequenceLength());
            ssrs.add(ssr);
        }
        header2.setSequenceDictionary(new SAMSequenceDictionary(ssrs));
        SAMSequenceDictionary dict2 = new SAMSequenceDictionary(ssrs);
        header2.setSequenceDictionary(dict2);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict1);
        sfw = this.writingBamArgs.openSAMFileWriter(output, header2, true);
        long num_ignored = 0L;
        SAMRecordIterator iter = sfr.iterator();
        while (iter.hasNext()) {
            SAMRecord rec1 = iter.next();
            progress.watch(rec1);
            String newName1 = null;
            String newName2 = null;
            if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
                newName1 = customMapping.apply(rec1.getReferenceName());
            }
            if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
                newName2 = customMapping.apply(rec1.getMateReferenceName());
            }
            rec1.setHeader(header2);
            if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
                if (newName1 == null) {
                    ++num_ignored;
                    continue;
                }
                rec1.setReferenceName(newName1);
            }
            if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
                if (newName2 == null) {
                    ++num_ignored;
                    continue;
                }
                rec1.setMateReferenceName(newName2);
            }
            sfw.addAlignment(rec1);
        }
        if (!unmappedChromosomes.isEmpty()) {
            LOG.warning("Unmapped chromosomes: " + unmappedChromosomes);
        }
        LOG.warning("num ignored read:" + num_ignored);
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(sfr);
        CloserUtil.close(sfw);
    }
}
Also used : SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMFileWriter(htsjdk.samtools.SAMFileWriter) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet)

Example 75 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class CoverageNormalizer method run.

private int run(SamReader sfr) throws IOException {
    SAMSequenceDictionary dictionary = sfr.getFileHeader().getSequenceDictionary();
    SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dictionary);
    File tmpFile1 = File.createTempFile("cov_", ".dat.gz", this.writingSortingCollection.getTmpDirectories().get(0));
    tmpFile1.deleteOnExit();
    LOG.info("Opening tmp File " + tmpFile1);
    GZIPOutputStream gos = null;
    DataInputStream dis = null;
    SortingCollection<Float> median = null;
    try {
        gos = new GZIPOutputStream(new FileOutputStream(tmpFile1));
        DataOutputStream daos = new DataOutputStream(gos);
        SAMRecordIterator iter = sfr.iterator();
        int curr_tid = -1;
        short[] array = null;
        float minCov = Float.MAX_VALUE;
        float maxCov = 0;
        int[] num_written = new int[dictionary.size()];
        Arrays.fill(num_written, 0);
        for (; ; ) {
            SAMRecord rec = null;
            if (iter.hasNext()) {
                rec = iter.next();
                if (rec.getReadUnmappedFlag())
                    continue;
                progress.watch(rec);
                if (rec.isSecondaryOrSupplementary())
                    continue;
            }
            if (rec == null || curr_tid != rec.getReferenceIndex()) {
                if (curr_tid != -1) {
                    LOG.info("Writing data for chromosome " + dictionary.getSequence(curr_tid).getSequenceName());
                    for (int i = 0; i + window_size <= array.length; i += window_shift) {
                        float sum = 0;
                        for (int j = 0; j < window_size; ++j) {
                            sum += array[i + j];
                        }
                        float v = sum / window_size;
                        daos.writeFloat(v);
                        minCov = (float) Math.min(minCov, v);
                        maxCov = (float) Math.max(maxCov, v);
                        num_written[curr_tid]++;
                    }
                    LOG.info("End writing data N=" + num_written[curr_tid]);
                }
                if (rec == null)
                    break;
                curr_tid = rec.getReferenceIndex();
                SAMSequenceRecord ssr = dictionary.getSequence(curr_tid);
                LOG.info("allocate " + ssr.getSequenceLength() + " for " + ssr.getSequenceName());
                array = null;
                System.gc();
                array = new short[ssr.getSequenceLength()];
                LOG.info("done: allocate.");
                Arrays.fill(array, (short) 0);
            }
            for (int i = rec.getAlignmentStart(); i < rec.getAlignmentEnd() && i < array.length; ++i) {
                array[i] = (short) Math.min((int) Short.MAX_VALUE, 1 + (int) array[i]);
            }
        }
        array = null;
        LOG.info("Closing BAM");
        CloserUtil.close(sfr);
        LOG.info("Closing " + tmpFile1);
        daos.flush();
        gos.finish();
        gos.flush();
        gos.close();
        // start normalizing min/max find median value
        long nWritten = 0L;
        median = SortingCollection.newInstance(Float.class, new FloatCodec(), new FloatCmp(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        median.setDestructiveIteration(true);
        dis = new DataInputStream(new GZIPInputStream(new FileInputStream(tmpFile1)));
        for (int n_items : num_written) {
            for (int i = 0; i < n_items; ++i) {
                float v = dis.readFloat();
                if (v < min_coverage)
                    continue;
                v = (float) ((v - minCov) / (double) (maxCov - minCov));
                median.add(v);
                ++nWritten;
            }
        }
        median.doneAdding();
        CloserUtil.close(dis);
        // get median
        float median_value = 0f;
        long half = nWritten / 2L;
        CloseableIterator<Float> iterFloat = median.iterator();
        while (iterFloat.hasNext() && half > 0) {
            median_value = iterFloat.next();
            --half;
            if (half <= 0) {
                LOG.info("median = " + median_value);
                break;
            }
        }
        CloserUtil.close(iterFloat);
        median.cleanup();
        median = null;
        progress = new SAMSequenceDictionaryProgress(dictionary);
        // dump data
        dis = new DataInputStream(new GZIPInputStream(new FileInputStream(tmpFile1)));
        for (int chrom_id = 0; chrom_id < num_written.length; ++chrom_id) {
            int n_items = num_written[chrom_id];
            int i = 0;
            Float value_start = null;
            while (i < n_items) {
                if (value_start == null) {
                    value_start = dis.readFloat();
                }
                int j = i + 1;
                Float value_end = null;
                while (j < n_items) {
                    value_end = dis.readFloat();
                    if (value_start.intValue() == value_end.intValue()) {
                        ++j;
                    } else {
                        break;
                    }
                }
                progress.watch(chrom_id, i * window_shift);
                if (value_start >= min_coverage) {
                    System.out.print(dictionary.getSequence(chrom_id).getSequenceName());
                    System.out.print('\t');
                    System.out.print(i * window_shift);
                    System.out.print('\t');
                    System.out.print((j - 1) * window_shift + window_size);
                    System.out.print('\t');
                    System.out.print((float) ((value_start - minCov) / (double) (maxCov - minCov)));
                    System.out.print('\t');
                    System.out.print(median_value);
                    System.out.println();
                    if (System.out.checkError())
                        break;
                }
                i = j;
                if (value_end == null)
                    break;
                value_start = value_end;
            }
        }
        CloserUtil.close(dis);
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(gos);
        CloserUtil.close(dis);
        if (tmpFile1 != null)
            tmpFile1.delete();
        if (median != null)
            median.cleanup();
    }
}
Also used : SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) DataOutputStream(java.io.DataOutputStream) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) DataInputStream(java.io.DataInputStream) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) GZIPInputStream(java.util.zip.GZIPInputStream) GZIPOutputStream(java.util.zip.GZIPOutputStream) FileOutputStream(java.io.FileOutputStream) SAMRecord(htsjdk.samtools.SAMRecord) File(java.io.File)

Aggregations

SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)107 SAMRecord (htsjdk.samtools.SAMRecord)92 SamReader (htsjdk.samtools.SamReader)83 SAMFileHeader (htsjdk.samtools.SAMFileHeader)49 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)47 File (java.io.File)47 SAMFileWriter (htsjdk.samtools.SAMFileWriter)45 IOException (java.io.IOException)41 ArrayList (java.util.ArrayList)34 CigarElement (htsjdk.samtools.CigarElement)30 Cigar (htsjdk.samtools.Cigar)26 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)24 SamReaderFactory (htsjdk.samtools.SamReaderFactory)21 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)18 CigarOperator (htsjdk.samtools.CigarOperator)16 Interval (htsjdk.samtools.util.Interval)16 PrintWriter (java.io.PrintWriter)15 HashMap (java.util.HashMap)15 SAMFileWriterFactory (htsjdk.samtools.SAMFileWriterFactory)14 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)14