Search in sources :

Example 1 with SAMRecordComparator

use of htsjdk.samtools.SAMRecordComparator in project jvarkit by lindenb.

the class OneThousandBam method doWork.

@Override
public int doWork(final List<String> args) {
    SAMFileWriter sfw = null;
    QueryInterval[] queryIntervalArray = null;
    try {
        if (this.bedFile == null && !this.exitAfterBaiDownload) {
            LOG.error("bed file is missing");
            return -1;
        }
        IOUtil.assertDirectoryIsWritable(this.baiCacheDir);
        if (args.isEmpty()) {
            LOG.error("no urls");
        } else if (args.size() == 1 && args.get(0).endsWith(".list")) {
            Files.lines(Paths.get(args.get(0))).filter(L -> !StringUtils.isBlank(L)).filter(L -> !L.startsWith("#")).map(L -> new Sample1KG(L)).forEach(S -> this.samples.add(S));
        } else {
            args.stream().map(L -> new Sample1KG(L)).forEach(S -> this.samples.add(S));
        }
        if (this.samples.isEmpty()) {
            LOG.error("no sample defined");
            return -1;
        }
        for (int x = 0; x < this.samples.size(); x++) {
            LOG.debug("bai " + (x + 1) + "/" + this.samples.size());
            this.samples.get(x).downloadBaiToCache();
        }
        if (this.exitAfterBaiDownload) {
            LOG.info("Bai downloaded");
            return 0;
        }
        for (final Sample1KG sample : this.samples) {
            LOG.debug("get sam file header for " + sample.url);
            try (final SamReader sr = sample.open()) {
                final SAMFileHeader header = sr.getFileHeader();
                if (!header.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
                    throw new RuntimeIOException(sample.url + " is not sorted");
                }
                sample.sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElseThrow(() -> new IllegalArgumentException("Cannot find sample in " + sample.url));
                final SAMSequenceDictionary dict = header.getSequenceDictionary();
                if (this.dict == null) {
                    LOG.debug("read bed file " + this.bedFile);
                    final List<QueryInterval> queryIntervals = new ArrayList<>();
                    this.dict = dict;
                    // load bed here
                    ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(this.dict);
                    final BedLineCodec codec = new BedLineCodec();
                    BufferedReader br = IOUtils.openPathForBufferedReading(this.bedFile);
                    String line;
                    while ((line = br.readLine()) != null) {
                        final BedLine bed = codec.decode(line);
                        if (bed == null)
                            continue;
                        String newCtg = ctgConverter.apply(bed.getContig());
                        if (StringUtils.isBlank(newCtg)) {
                            throw new JvarkitException.ContigNotFoundInDictionary(bed.getContig(), this.dict);
                        }
                        final int tid = this.dict.getSequenceIndex(newCtg);
                        final QueryInterval queryInterval = new QueryInterval(tid, bed.getStart(), bed.getEnd());
                        queryIntervals.add(queryInterval);
                    }
                    br.close();
                    queryIntervalArray = QueryInterval.optimizeIntervals(queryIntervals.toArray(new QueryInterval[queryIntervals.size()]));
                } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, this.dict)) {
                    throw new JvarkitException.DictionariesAreNotTheSame(dict, this.dict);
                }
            }
        }
        if (queryIntervalArray == null || queryIntervalArray.length == 0) {
            LOG.error("no query interval defined");
            return -1;
        }
        final SAMRecordComparator samRecordComparator = new SAMRecordCoordinateComparator();
        final SAMFileHeader ouSamFileHeader = new SAMFileHeader(this.dict);
        ouSamFileHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        for (final Sample1KG sample : this.samples) {
            sample.readGroupRecord = new SAMReadGroupRecord(sample.sampleName);
            sample.readGroupRecord.setSample(sample.sampleName);
            sample.readGroupRecord.setLibrary(sample.sampleName);
            sample.readGroupRecord.setAttribute("URL", sample.url);
            ouSamFileHeader.addReadGroup(sample.readGroupRecord);
        }
        final SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
        sfwf.setCreateIndex(false);
        sfwf.setCreateMd5File(false);
        if (this.outputFile == null) {
            sfw = sfwf.makeBAMWriter(ouSamFileHeader, true, stdout());
        } else {
            sfw = sfwf.makeSAMOrBAMWriter(ouSamFileHeader, true, this.outputFile);
        }
        int interval_index = 0;
        while (interval_index < queryIntervalArray.length) {
            LOG.debug("interval_index= " + interval_index);
            /* current interval fetched by bam, can be updated */
            QueryInterval interval = queryIntervalArray[interval_index];
            LOG.debug("interval= " + interval);
            final List<Path> tmpFiles = new ArrayList<>(this.samples.size());
            boolean done = false;
            while (!done) {
                done = true;
                for (final Path p : tmpFiles) Files.delete(p);
                tmpFiles.clear();
                for (final Sample1KG sample : this.samples) {
                    if (!done)
                        break;
                    // try forever until it works !
                    for (; ; ) {
                        // try to download data for the current interval
                        final Path tmpBam = Files.createTempFile(this.baiCacheDir, "tmp.", ".bam");
                        SamReader srf = null;
                        SAMFileWriter rgnWriter = null;
                        SAMRecordIterator samIter = null;
                        try {
                            srf = sample.open();
                            // LOG.debug("query= "+sample.sampleName+" "+interval);
                            samIter = srf.query(new QueryInterval[] { interval }, false);
                            rgnWriter = sfwf.makeBAMWriter(ouSamFileHeader, true, tmpBam);
                            while (samIter.hasNext()) {
                                final SAMRecord rec = samIter.next();
                                if (rec.isSecondaryOrSupplementary())
                                    continue;
                                if (rec.getReadUnmappedFlag())
                                    continue;
                                if (rec.getAlignmentEnd() > interval.end && /* current read goes beyond current interval */
                                interval_index + 1 < queryIntervalArray.length && /* there is a new interval */
                                interval.referenceIndex == queryIntervalArray[interval_index + 1].referenceIndex && /* on same chromosome */
                                queryIntervalArray[interval_index + 1].start <= rec.getAlignmentEnd()) {
                                    // update current interval, merge with next
                                    interval = new QueryInterval(interval.referenceIndex, interval.start, queryIntervalArray[interval_index + 1].end);
                                    interval_index++;
                                    LOG.info("extending interval to " + interval);
                                    done = false;
                                    break;
                                }
                                rec.setAttribute(SAMTag.RG.name(), sample.readGroupRecord.getId());
                                rgnWriter.addAlignment(rec);
                            }
                            samIter.close();
                            samIter = null;
                            srf.close();
                            srf = null;
                            rgnWriter.close();
                            rgnWriter = null;
                        // LOG.debug("donequery= "+sample.url);
                        } catch (final Exception err) {
                            LOG.error(err);
                            Files.delete(tmpBam);
                            continue;
                        } finally {
                            CloserUtil.close(samIter);
                            CloserUtil.close(srf);
                            CloserUtil.close(rgnWriter);
                        }
                        tmpFiles.add(tmpBam);
                        break;
                    }
                    if (!done)
                        break;
                }
                // end of download each sample
                if (!done)
                    continue;
            }
            // end of while(!done)
            // LOG.info("merging "+interval);
            // merge everything
            final List<SamReader> chunkReaders = new ArrayList<>(samples.size());
            final List<CloseableIterator<SAMRecord>> chunkIterators = new ArrayList<>(samples.size());
            for (final Path tmpPath : tmpFiles) {
                final SamReader tmpReader = samReaderFactory.open(tmpPath);
                chunkReaders.add(tmpReader);
                chunkIterators.add(tmpReader.iterator());
            }
            final MergingIterator<SAMRecord> merger = new MergingIterator<>(samRecordComparator, chunkIterators);
            while (merger.hasNext()) {
                sfw.addAlignment(merger.next());
            }
            merger.close();
            CloserUtil.close(chunkIterators);
            CloserUtil.close(chunkReaders);
            // cleanup
            for (final Path p : tmpFiles) Files.delete(p);
            interval_index++;
        }
        sfw.close();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) MergingIterator(htsjdk.samtools.util.MergingIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) IOUtil(htsjdk.samtools.util.IOUtil) URL(java.net.URL) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ArrayList(java.util.ArrayList) SAMTag(htsjdk.samtools.SAMTag) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) MalformedURLException(java.net.MalformedURLException) Files(java.nio.file.Files) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SamReader(htsjdk.samtools.SamReader) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SamInputResource(htsjdk.samtools.SamInputResource) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) QueryInterval(htsjdk.samtools.QueryInterval) SAMRecordCoordinateComparator(htsjdk.samtools.SAMRecordCoordinateComparator) SAMRecordComparator(htsjdk.samtools.SAMRecordComparator) BufferedReader(java.io.BufferedReader) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SamReaderFactory(htsjdk.samtools.SamReaderFactory) InputStream(java.io.InputStream) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) QueryInterval(htsjdk.samtools.QueryInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SAMRecordComparator(htsjdk.samtools.SAMRecordComparator) SamReader(htsjdk.samtools.SamReader) SAMRecordCoordinateComparator(htsjdk.samtools.SAMRecordCoordinateComparator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Path(java.nio.file.Path) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) MergingIterator(htsjdk.samtools.util.MergingIterator) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) BufferedReader(java.io.BufferedReader) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Aggregations

Parameter (com.beust.jcommander.Parameter)1 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)1 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)1 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)1 BedLine (com.github.lindenb.jvarkit.util.bio.bed.BedLine)1 BedLineCodec (com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec)1 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)1 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)1 Program (com.github.lindenb.jvarkit.util.jcommander.Program)1 Logger (com.github.lindenb.jvarkit.util.log.Logger)1 QueryInterval (htsjdk.samtools.QueryInterval)1 SAMFileHeader (htsjdk.samtools.SAMFileHeader)1 SAMFileWriter (htsjdk.samtools.SAMFileWriter)1 SAMFileWriterFactory (htsjdk.samtools.SAMFileWriterFactory)1 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)1 SAMRecord (htsjdk.samtools.SAMRecord)1 SAMRecordComparator (htsjdk.samtools.SAMRecordComparator)1 SAMRecordCoordinateComparator (htsjdk.samtools.SAMRecordCoordinateComparator)1 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)1 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)1