Search in sources :

Example 31 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class CompareBams method doWork.

@Override
public int doWork(final List<String> args) {
    this.inputBamsList.addAll(IOUtils.unrollPaths(args));
    SortingCollection<Match> database = null;
    SamReader samFileReader = null;
    CloseableIterator<Match> iter = null;
    try {
        if (this.inputBamsList.size() < 2) {
            LOG.error("Need more bams please, got " + this.inputBamsList.size());
            return -1;
        }
        database = SortingCollection.newInstance(Match.class, new MatchCodec(), (A, B) -> matchCompare0(A, B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        this.samSequenceDictAreTheSame = true;
        database.setDestructiveIteration(true);
        for (int currentSamFileIndex = 0; currentSamFileIndex < this.inputBamsList.size(); currentSamFileIndex++) {
            final Path samFile = this.inputBamsList.get(currentSamFileIndex);
            samFileReader = super.createSamReaderFactory().referenceSequence(this.refPath).open(samFile);
            final SAMSequenceDictionary dict = samFileReader.getFileHeader().getSequenceDictionary();
            if (dict == null || dict.isEmpty()) {
                LOG.error("Empty Dict  in " + samFile);
                return -1;
            }
            if (!this.sequenceDictionaries.isEmpty() && !SequenceUtil.areSequenceDictionariesEqual(this.sequenceDictionaries.get(0), dict)) {
                this.samSequenceDictAreTheSame = false;
                LOG.warn("FOOL !! THE SEQUENCE DICTIONARIES ARE **NOT** THE SAME. I will try to compare anyway but it will be slower.");
            }
            this.sequenceDictionaries.add(dict);
            final Optional<SimpleInterval> interval;
            if (!StringUtils.isBlank(this.REGION)) {
                interval = IntervalParserFactory.newInstance().dictionary(dict).make().apply(this.REGION);
                if (!interval.isPresent()) {
                    LOG.error("Cannot parse " + REGION + " (bad syntax or not in dictionary)");
                    return -1;
                }
            } else {
                interval = Optional.empty();
            }
            SAMRecordIterator it = null;
            if (!interval.isPresent()) {
                it = samFileReader.iterator();
            } else {
                it = samFileReader.queryOverlapping(interval.get().getContig(), interval.get().getStart(), interval.get().getEnd());
            }
            final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
            while (it.hasNext()) {
                final SAMRecord rec = progress.apply(it.next());
                if (!rec.getReadUnmappedFlag()) {
                    if (rec.getMappingQuality() < this.min_mapq)
                        continue;
                    if (!this.no_read_filtering && rec.isSecondaryOrSupplementary())
                        continue;
                }
                final Match m = new Match();
                if (rec.getReadPairedFlag()) {
                    m.num_in_pair = (rec.getFirstOfPairFlag() ? 1 : 2);
                } else {
                    m.num_in_pair = 0;
                }
                m.readName = rec.getReadName();
                m.bamIndex = currentSamFileIndex;
                m.flag = rec.getFlags();
                m.cigar = rec.getCigarString();
                if (m.cigar == null)
                    m.cigar = "";
                if (rec.getReadUnmappedFlag()) {
                    m.tid = -1;
                    m.pos = -1;
                } else {
                    m.tid = rec.getReferenceIndex();
                    m.pos = rec.getAlignmentStart();
                }
                database.add(m);
            }
            progress.close();
            it.close();
            samFileReader.close();
            samFileReader = null;
        }
        database.doneAdding();
        LOG.info("Writing results....");
        this.out = super.openPathOrStdoutAsPrintWriter(this.outputFile);
        // compute the differences for each read
        this.out.print("#READ-Name\t");
        for (int x = 0; x < this.inputBamsList.size(); ++x) {
            for (int y = x + 1; y < this.inputBamsList.size(); ++y) {
                if (!(x == 0 && y == 1))
                    this.out.print("|");
                this.out.print(inputBamsList.get(x));
                this.out.print(" ");
                this.out.print(inputBamsList.get(y));
            }
        }
        for (int x = 0; x < this.inputBamsList.size(); ++x) {
            this.out.print("\t" + inputBamsList.get(x));
        }
        this.out.println();
        /* create an array of set<Match> */
        final Comparator<Match> match_comparator = (A, B) -> matchCompare1(A, B);
        final List<Set<Match>> matches = new ArrayList<Set<CompareBams.Match>>(this.inputBamsList.size());
        while (matches.size() < this.inputBamsList.size()) {
            matches.add(new TreeSet<CompareBams.Match>(match_comparator));
        }
        iter = database.iterator();
        String currReadName = null;
        int curr_num_in_pair = -1;
        for (; ; ) {
            Match nextMatch = null;
            if (iter.hasNext()) {
                nextMatch = iter.next();
            }
            if (nextMatch == null || (currReadName != null && !currReadName.equals(nextMatch.readName)) || (curr_num_in_pair != -1 && curr_num_in_pair != nextMatch.num_in_pair)) {
                if (currReadName != null) {
                    this.out.print(currReadName);
                    if (curr_num_in_pair > 0) {
                        this.out.print("/");
                        this.out.print(curr_num_in_pair);
                    }
                    this.out.print("\t");
                    for (int x = 0; x < this.inputBamsList.size(); ++x) {
                        final Set<Match> first = matches.get(x);
                        for (int y = x + 1; y < this.inputBamsList.size(); ++y) {
                            if (!(x == 0 && y == 1))
                                this.out.print("|");
                            final Set<Match> second = matches.get(y);
                            if (same(first, second)) {
                                this.out.print("EQ");
                            } else {
                                this.out.print("NE");
                            }
                        }
                    }
                    for (int x = 0; x < this.inputBamsList.size(); ++x) {
                        this.out.print("\t");
                        print(matches.get(x), sequenceDictionaries.get(x));
                    }
                    this.out.println();
                }
                if (nextMatch == null)
                    break;
                for (final Set<Match> set : matches) set.clear();
            }
            currReadName = nextMatch.readName;
            curr_num_in_pair = nextMatch.num_in_pair;
            matches.get(nextMatch.bamIndex).add(nextMatch);
            if (this.out.checkError())
                break;
        }
        iter.close();
        this.out.flush();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        if (database != null)
            database.cleanup();
        CloserUtil.close(samFileReader);
        CloserUtil.close(this.out);
        this.out = null;
    }
}
Also used : DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) IntervalParserFactory(com.github.lindenb.jvarkit.samtools.util.IntervalParserFactory) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) DataOutputStream(java.io.DataOutputStream) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SortingCollection(htsjdk.samtools.util.SortingCollection) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) Set(java.util.Set) IOException(java.io.IOException) SamReader(htsjdk.samtools.SamReader) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) Optional(java.util.Optional) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) Path(java.nio.file.Path) TreeSet(java.util.TreeSet) Set(java.util.Set) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) SAMRecord(htsjdk.samtools.SAMRecord) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval)

Example 32 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class MergeBlastXml method doWork.

@Override
public int doWork(List<String> args) {
    if (args.isEmpty()) {
        LOG.error("input xml missing");
        return -1;
    }
    XMLEventReader rx = null;
    XMLEventReader rx2 = null;
    XMLEventWriter wx = null;
    SortingCollection<Iteration> sortingCollection = null;
    try {
        JAXBContext jc = JAXBContext.newInstance("gov.nih.nlm.ncbi.blast");
        this.unmarshaller = jc.createUnmarshaller();
        this.marshaller = jc.createMarshaller();
        this.marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true);
        this.marshaller.setProperty(Marshaller.JAXB_FRAGMENT, true);
        XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
        xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlInputFactory.setXMLResolver(new XMLResolver() {

            @Override
            public Object resolveEntity(String arg0, String arg1, String arg2, String arg3) throws XMLStreamException {
                LOG.info("resolveEntity:" + arg0 + "/" + arg1 + "/" + arg2);
                return null;
            }
        });
        final Comparator<Iteration> hitComparator = (A, B) -> {
            return A.getIterationQueryDef().compareTo(B.getIterationQueryDef());
        };
        sortingCollection = SortingCollection.newInstance(Iteration.class, new BlastIterationCodec(), hitComparator, this.maxRecordsInRam, this.tmpFile.toPath());
        rx = xmlInputFactory.createXMLEventReader(new FileReader(args.get(0)));
        XMLOutputFactory xof = XMLOutputFactory.newFactory();
        if (this.outputFile != null) {
            wx = xof.createXMLEventWriter(new StreamResult(this.outputFile));
        } else {
            wx = xof.createXMLEventWriter(new StreamResult(stdout()));
        }
        boolean in_iteration = false;
        while (rx.hasNext()) {
            final XMLEvent evt = rx.peek();
            if (evt.isStartElement() && evt.asStartElement().getName().getLocalPart().equals("Iteration")) {
                final Iteration iteration = this.unmarshaller.unmarshal(rx, Iteration.class).getValue();
                sortingCollection.add(iteration);
            } else if (evt.isStartElement() && evt.asStartElement().getName().getLocalPart().equals("BlastOutput_iterations")) {
                wx.add(rx.nextEvent());
                in_iteration = true;
            } else if (evt.isEndElement() && evt.asEndElement().getName().getLocalPart().equals("BlastOutput_iterations")) {
                for (int optind = 1; optind < args.size(); ++optind) {
                    LOG.info("opening " + args.get(optind));
                    rx2 = xmlInputFactory.createXMLEventReader(new FileReader(args.get(optind)));
                    while (rx2.hasNext()) {
                        XMLEvent evt2 = rx2.peek();
                        if (evt2.isStartElement() && evt2.asStartElement().getName().getLocalPart().equals("Iteration")) {
                            final Iteration iteration = this.unmarshaller.unmarshal(rx2, Iteration.class).getValue();
                            sortingCollection.add(iteration);
                        } else {
                            rx2.nextEvent();
                        }
                    }
                    rx2.close();
                    LOG.info("close");
                }
                sortingCollection.doneAdding();
                sortingCollection.setDestructiveIteration(true);
                final CloseableIterator<Iteration> coliter = sortingCollection.iterator();
                final EqualRangeIterator<Iteration> eq = new EqualRangeIterator<>(coliter, hitComparator);
                while (coliter.hasNext()) {
                    final List<Iteration> L = eq.next();
                    for (int i = 1; i < L.size(); ++i) {
                        L.get(0).getIterationHits().getHit().addAll(L.get(i).getIterationHits().getHit());
                    }
                    marshaller.marshal(L.get(0), wx);
                }
                eq.close();
                coliter.close();
                sortingCollection.cleanup();
                sortingCollection = null;
                wx.add(rx.nextEvent());
                in_iteration = false;
            } else if (in_iteration) {
                // consumme text
                rx.nextEvent();
            } else {
                wx.add(rx.nextEvent());
            }
        }
        wx.flush();
        wx.close();
        return 0;
    } catch (Exception e) {
        LOG.error(e);
        if (sortingCollection != null) {
            sortingCollection.cleanup();
        }
        return -1;
    } finally {
    }
}
Also used : DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) XMLInputFactory(javax.xml.stream.XMLInputFactory) Marshaller(javax.xml.bind.Marshaller) StreamResult(javax.xml.transform.stream.StreamResult) StreamSource(javax.xml.transform.stream.StreamSource) XMLEventWriter(javax.xml.stream.XMLEventWriter) DataOutputStream(java.io.DataOutputStream) XMLEvent(javax.xml.stream.events.XMLEvent) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) XMLStreamException(javax.xml.stream.XMLStreamException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Iteration(gov.nih.nlm.ncbi.blast.Iteration) XMLResolver(javax.xml.stream.XMLResolver) JAXBContext(javax.xml.bind.JAXBContext) Unmarshaller(javax.xml.bind.Unmarshaller) XMLEventReader(javax.xml.stream.XMLEventReader) SortingCollection(htsjdk.samtools.util.SortingCollection) StringWriter(java.io.StringWriter) Logger(com.github.lindenb.jvarkit.util.log.Logger) IOException(java.io.IOException) JAXBException(javax.xml.bind.JAXBException) File(java.io.File) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) List(java.util.List) StringReader(java.io.StringReader) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) FileReader(java.io.FileReader) Comparator(java.util.Comparator) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) Iteration(gov.nih.nlm.ncbi.blast.Iteration) JAXBContext(javax.xml.bind.JAXBContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) XMLEventWriter(javax.xml.stream.XMLEventWriter) XMLEventReader(javax.xml.stream.XMLEventReader) XMLResolver(javax.xml.stream.XMLResolver) FileReader(java.io.FileReader) StreamResult(javax.xml.transform.stream.StreamResult) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) XMLStreamException(javax.xml.stream.XMLStreamException) IOException(java.io.IOException) JAXBException(javax.xml.bind.JAXBException) XMLStreamException(javax.xml.stream.XMLStreamException) XMLEvent(javax.xml.stream.events.XMLEvent) XMLInputFactory(javax.xml.stream.XMLInputFactory)

Example 33 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class VcfSlidingWindowSplitter method run.

private int run(final List<String> args) {
    SortingCollection<WinAndLine> sortingcollection = null;
    BufferedReader in = null;
    FileOutputStream fos = null;
    CloseableIterator<WinAndLine> iter = null;
    ArchiveFactory archiveFactory = null;
    PrintWriter manifest = null;
    final Function<VariantContext, List<Interval>> makeWindows = (ctx) -> {
        final int ctx_start = ctx.getStart();
        final int ctx_end = ctx.getEnd();
        final List<Interval> list = new ArrayList<>();
        int right = ctx_start - ctx_start % this.window_shift;
        while (right + this.window_size >= ctx_start) {
            right -= this.window_shift;
        }
        while (right <= ctx_end) {
            final int left = right + this.window_size;
            if (right > 0 && CoordMath.overlaps(right, left, ctx_start, ctx_end)) {
                list.add(new Interval(ctx.getContig(), right, left));
            }
            right += this.window_shift;
        }
        return list;
    };
    try {
        final Path tmpVcf = Files.createTempFile("tmp.", ".vcf.gz");
        archiveFactory = ArchiveFactory.open(this.outputFile);
        manifest = new PrintWriter(this.manifestFile == null ? new NullOuputStream() : IOUtils.openPathForWriting(manifestFile));
        manifest.println("#chrom\tstart\tend\twindow\tpath\tCount_Variants");
        in = super.openBufferedReader(oneFileOrNull(args));
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
        // read variants
        final ProgressFactory.Watcher<VariantContext> progess = ProgressFactory.newInstance().dictionary(cah.header).logger(LOG).build();
        String prevCtg = null;
        for (; ; ) {
            final String line = in.readLine();
            final VariantContext ctx = (line == null ? null : cah.codec.decode(line));
            if (ctx != null)
                progess.apply(ctx);
            if (ctx == null || !ctx.getContig().equals(prevCtg)) {
                if (sortingcollection != null) {
                    sortingcollection.doneAdding();
                    iter = sortingcollection.iterator();
                    final EqualRangeIterator<WinAndLine> eqiter = new EqualRangeIterator<>(iter, new WinAndLineComparator1());
                    while (eqiter.hasNext()) {
                        final List<WinAndLine> buffer = eqiter.next();
                        if (buffer.size() < this.min_number_of_ctx)
                            continue;
                        if (this.max_number_of_ctx > 0 && buffer.size() > this.max_number_of_ctx)
                            continue;
                        final WinAndLine first = buffer.get(0);
                        final VariantContextWriter out = VCFUtils.createVariantContextWriterToPath(tmpVcf);
                        final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
                        header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + ".interval", first.interval.getContig() + ":" + first.interval.getStart() + "-" + first.interval.getEnd()));
                        out.writeHeader(header2);
                        int minPos = Integer.MAX_VALUE;
                        int maxPos = 0;
                        for (final WinAndLine kl : buffer) {
                            final VariantContext ctx2 = cah.codec.decode(kl.ctx);
                            minPos = Math.min(ctx2.getStart(), minPos);
                            maxPos = Math.max(ctx2.getEnd(), maxPos);
                            out.add(ctx2);
                        }
                        out.close();
                        final String md5 = StringUtils.md5(first.interval.getContig() + ":" + first.interval.getStart() + "-" + first.interval.getEnd());
                        final String filename = md5.substring(0, 2) + File.separatorChar + md5.substring(2) + File.separator + first.interval.getContig() + "_" + first.interval.getStart() + "_" + first.interval.getEnd() + ".vcf.gz";
                        try (final OutputStream os = archiveFactory.openOuputStream(filename)) {
                            IOUtils.copyTo(tmpVcf, os);
                            os.flush();
                        }
                        manifest.print(prevCtg);
                        manifest.print('\t');
                        manifest.print(minPos - 1);
                        manifest.print('\t');
                        manifest.print(maxPos);
                        manifest.print('\t');
                        manifest.print(first.interval.getContig() + ":" + first.interval.getStart() + "-" + first.interval.getEnd());
                        manifest.print('\t');
                        manifest.print((archiveFactory.isTarOrZipArchive() ? "" : this.outputFile.toString() + File.separator) + filename);
                        manifest.print('\t');
                        manifest.println(buffer.size());
                    }
                    eqiter.close();
                    iter.close();
                }
                sortingcollection = null;
                if (ctx == null)
                    break;
                prevCtg = ctx.getContig();
            }
            for (final Interval win : makeWindows.apply(ctx)) {
                if (sortingcollection == null) {
                    sortingcollection = SortingCollection.newInstance(WinAndLine.class, new WinAndLineCodec(), new WinAndLineComparator2(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
                    sortingcollection.setDestructiveIteration(true);
                }
                sortingcollection.add(new WinAndLine(win, line));
            }
        }
        progess.close();
        manifest.flush();
        manifest.close();
        archiveFactory.close();
        Files.deleteIfExists(tmpVcf);
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        if (sortingcollection != null)
            sortingcollection.cleanup();
        CloserUtil.close(in);
        CloserUtil.close(fos);
        CloserUtil.close(manifest);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DataInputStream(java.io.DataInputStream) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) VCFHeader(htsjdk.variant.vcf.VCFHeader) Function(java.util.function.Function) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) Interval(htsjdk.samtools.util.Interval) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) DataOutputStream(java.io.DataOutputStream) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) OutputStream(java.io.OutputStream) PrintWriter(java.io.PrintWriter) SortingCollection(htsjdk.samtools.util.SortingCollection) Files(java.nio.file.Files) Logger(com.github.lindenb.jvarkit.util.log.Logger) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) EOFException(java.io.EOFException) File(java.io.File) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VariantContext(htsjdk.variant.variantcontext.VariantContext) BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) DataOutputStream(java.io.DataOutputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) VariantContext(htsjdk.variant.variantcontext.VariantContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) ArrayList(java.util.ArrayList) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) Path(java.nio.file.Path) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) IOException(java.io.IOException) EOFException(java.io.EOFException) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) Interval(htsjdk.samtools.util.Interval)

Example 34 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class OneThousandBam method doWork.

@Override
public int doWork(final List<String> args) {
    SAMFileWriter sfw = null;
    QueryInterval[] queryIntervalArray = null;
    try {
        if (this.bedFile == null && !this.exitAfterBaiDownload) {
            LOG.error("bed file is missing");
            return -1;
        }
        IOUtil.assertDirectoryIsWritable(this.baiCacheDir);
        if (args.isEmpty()) {
            LOG.error("no urls");
        } else if (args.size() == 1 && args.get(0).endsWith(".list")) {
            Files.lines(Paths.get(args.get(0))).filter(L -> !StringUtils.isBlank(L)).filter(L -> !L.startsWith("#")).map(L -> new Sample1KG(L)).forEach(S -> this.samples.add(S));
        } else {
            args.stream().map(L -> new Sample1KG(L)).forEach(S -> this.samples.add(S));
        }
        if (this.samples.isEmpty()) {
            LOG.error("no sample defined");
            return -1;
        }
        for (int x = 0; x < this.samples.size(); x++) {
            LOG.debug("bai " + (x + 1) + "/" + this.samples.size());
            this.samples.get(x).downloadBaiToCache();
        }
        if (this.exitAfterBaiDownload) {
            LOG.info("Bai downloaded");
            return 0;
        }
        for (final Sample1KG sample : this.samples) {
            LOG.debug("get sam file header for " + sample.url);
            try (final SamReader sr = sample.open()) {
                final SAMFileHeader header = sr.getFileHeader();
                if (!header.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
                    throw new RuntimeIOException(sample.url + " is not sorted");
                }
                sample.sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElseThrow(() -> new IllegalArgumentException("Cannot find sample in " + sample.url));
                final SAMSequenceDictionary dict = header.getSequenceDictionary();
                if (this.dict == null) {
                    LOG.debug("read bed file " + this.bedFile);
                    final List<QueryInterval> queryIntervals = new ArrayList<>();
                    this.dict = dict;
                    // load bed here
                    ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(this.dict);
                    final BedLineCodec codec = new BedLineCodec();
                    BufferedReader br = IOUtils.openPathForBufferedReading(this.bedFile);
                    String line;
                    while ((line = br.readLine()) != null) {
                        final BedLine bed = codec.decode(line);
                        if (bed == null)
                            continue;
                        String newCtg = ctgConverter.apply(bed.getContig());
                        if (StringUtils.isBlank(newCtg)) {
                            throw new JvarkitException.ContigNotFoundInDictionary(bed.getContig(), this.dict);
                        }
                        final int tid = this.dict.getSequenceIndex(newCtg);
                        final QueryInterval queryInterval = new QueryInterval(tid, bed.getStart(), bed.getEnd());
                        queryIntervals.add(queryInterval);
                    }
                    br.close();
                    queryIntervalArray = QueryInterval.optimizeIntervals(queryIntervals.toArray(new QueryInterval[queryIntervals.size()]));
                } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, this.dict)) {
                    throw new JvarkitException.DictionariesAreNotTheSame(dict, this.dict);
                }
            }
        }
        if (queryIntervalArray == null || queryIntervalArray.length == 0) {
            LOG.error("no query interval defined");
            return -1;
        }
        final SAMRecordComparator samRecordComparator = new SAMRecordCoordinateComparator();
        final SAMFileHeader ouSamFileHeader = new SAMFileHeader(this.dict);
        ouSamFileHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        for (final Sample1KG sample : this.samples) {
            sample.readGroupRecord = new SAMReadGroupRecord(sample.sampleName);
            sample.readGroupRecord.setSample(sample.sampleName);
            sample.readGroupRecord.setLibrary(sample.sampleName);
            sample.readGroupRecord.setAttribute("URL", sample.url);
            ouSamFileHeader.addReadGroup(sample.readGroupRecord);
        }
        final SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
        sfwf.setCreateIndex(false);
        sfwf.setCreateMd5File(false);
        if (this.outputFile == null) {
            sfw = sfwf.makeBAMWriter(ouSamFileHeader, true, stdout());
        } else {
            sfw = sfwf.makeSAMOrBAMWriter(ouSamFileHeader, true, this.outputFile);
        }
        int interval_index = 0;
        while (interval_index < queryIntervalArray.length) {
            LOG.debug("interval_index= " + interval_index);
            /* current interval fetched by bam, can be updated */
            QueryInterval interval = queryIntervalArray[interval_index];
            LOG.debug("interval= " + interval);
            final List<Path> tmpFiles = new ArrayList<>(this.samples.size());
            boolean done = false;
            while (!done) {
                done = true;
                for (final Path p : tmpFiles) Files.delete(p);
                tmpFiles.clear();
                for (final Sample1KG sample : this.samples) {
                    if (!done)
                        break;
                    // try forever until it works !
                    for (; ; ) {
                        // try to download data for the current interval
                        final Path tmpBam = Files.createTempFile(this.baiCacheDir, "tmp.", ".bam");
                        SamReader srf = null;
                        SAMFileWriter rgnWriter = null;
                        SAMRecordIterator samIter = null;
                        try {
                            srf = sample.open();
                            // LOG.debug("query= "+sample.sampleName+" "+interval);
                            samIter = srf.query(new QueryInterval[] { interval }, false);
                            rgnWriter = sfwf.makeBAMWriter(ouSamFileHeader, true, tmpBam);
                            while (samIter.hasNext()) {
                                final SAMRecord rec = samIter.next();
                                if (rec.isSecondaryOrSupplementary())
                                    continue;
                                if (rec.getReadUnmappedFlag())
                                    continue;
                                if (rec.getAlignmentEnd() > interval.end && /* current read goes beyond current interval */
                                interval_index + 1 < queryIntervalArray.length && /* there is a new interval */
                                interval.referenceIndex == queryIntervalArray[interval_index + 1].referenceIndex && /* on same chromosome */
                                queryIntervalArray[interval_index + 1].start <= rec.getAlignmentEnd()) {
                                    // update current interval, merge with next
                                    interval = new QueryInterval(interval.referenceIndex, interval.start, queryIntervalArray[interval_index + 1].end);
                                    interval_index++;
                                    LOG.info("extending interval to " + interval);
                                    done = false;
                                    break;
                                }
                                rec.setAttribute(SAMTag.RG.name(), sample.readGroupRecord.getId());
                                rgnWriter.addAlignment(rec);
                            }
                            samIter.close();
                            samIter = null;
                            srf.close();
                            srf = null;
                            rgnWriter.close();
                            rgnWriter = null;
                        // LOG.debug("donequery= "+sample.url);
                        } catch (final Exception err) {
                            LOG.error(err);
                            Files.delete(tmpBam);
                            continue;
                        } finally {
                            CloserUtil.close(samIter);
                            CloserUtil.close(srf);
                            CloserUtil.close(rgnWriter);
                        }
                        tmpFiles.add(tmpBam);
                        break;
                    }
                    if (!done)
                        break;
                }
                // end of download each sample
                if (!done)
                    continue;
            }
            // end of while(!done)
            // LOG.info("merging "+interval);
            // merge everything
            final List<SamReader> chunkReaders = new ArrayList<>(samples.size());
            final List<CloseableIterator<SAMRecord>> chunkIterators = new ArrayList<>(samples.size());
            for (final Path tmpPath : tmpFiles) {
                final SamReader tmpReader = samReaderFactory.open(tmpPath);
                chunkReaders.add(tmpReader);
                chunkIterators.add(tmpReader.iterator());
            }
            final MergingIterator<SAMRecord> merger = new MergingIterator<>(samRecordComparator, chunkIterators);
            while (merger.hasNext()) {
                sfw.addAlignment(merger.next());
            }
            merger.close();
            CloserUtil.close(chunkIterators);
            CloserUtil.close(chunkReaders);
            // cleanup
            for (final Path p : tmpFiles) Files.delete(p);
            interval_index++;
        }
        sfw.close();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) MergingIterator(htsjdk.samtools.util.MergingIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) IOUtil(htsjdk.samtools.util.IOUtil) URL(java.net.URL) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ArrayList(java.util.ArrayList) SAMTag(htsjdk.samtools.SAMTag) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) MalformedURLException(java.net.MalformedURLException) Files(java.nio.file.Files) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SamReader(htsjdk.samtools.SamReader) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SamInputResource(htsjdk.samtools.SamInputResource) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) QueryInterval(htsjdk.samtools.QueryInterval) SAMRecordCoordinateComparator(htsjdk.samtools.SAMRecordCoordinateComparator) SAMRecordComparator(htsjdk.samtools.SAMRecordComparator) BufferedReader(java.io.BufferedReader) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SamReaderFactory(htsjdk.samtools.SamReaderFactory) InputStream(java.io.InputStream) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) QueryInterval(htsjdk.samtools.QueryInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SAMRecordComparator(htsjdk.samtools.SAMRecordComparator) SamReader(htsjdk.samtools.SamReader) SAMRecordCoordinateComparator(htsjdk.samtools.SAMRecordCoordinateComparator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Path(java.nio.file.Path) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) MergingIterator(htsjdk.samtools.util.MergingIterator) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) BufferedReader(java.io.BufferedReader) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Example 35 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class ScanRetroCopy method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.min_depth < 1) {
        LOG.error("Bad min depth");
        return -1;
    }
    SamReader sr = null;
    VariantContextWriter vcw0 = null;
    CloseableIterator<SAMRecord> iter = null;
    SAMFileWriter sfw = null;
    try {
        /* load the reference genome */
        this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
        /* create a contig name converter from the REF */
        this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
        /* READ KNOWGENES FILES */
        LOG.info("Loading " + this.knownGeneUri);
        try (BufferedReader br = IOUtils.openURIForBufferedReading(this.knownGeneUri)) {
            String line;
            final CharSplitter tab = CharSplitter.TAB;
            while ((line = br.readLine()) != null) {
                if (StringUtils.isBlank(line))
                    continue;
                final String[] tokens = tab.split(line);
                final KnownGene kg = new KnownGene(tokens);
                if (kg.getExonCount() < 2)
                    continue;
                if (this.onlyCodingTranscript && kg.isNonCoding())
                    continue;
                final String ctg = this.refCtgNameConverter.apply(kg.getContig());
                if (StringUtils.isBlank(ctg))
                    continue;
                kg.setChrom(ctg);
                final Interval interval = new Interval(ctg, kg.getTxStart() + 1, kg.getTxEnd(), kg.isNegativeStrand(), kg.getName());
                List<KnownGene> L = this.knownGenesMap.get(interval);
                if (L == null) {
                    L = new ArrayList<KnownGene>();
                    this.knownGenesMap.put(interval, L);
                }
                L.add(kg);
            }
        }
        if (this.knownGenesMap.isEmpty()) {
            LOG.error("no gene found in " + this.knownGeneUri);
            return -1;
        }
        LOG.info("Number of transcripts: " + this.knownGenesMap.values().stream().flatMap(L -> L.stream()).count());
        // open the sam file
        final SamReaderFactory samReaderFactory = super.createSamReaderFactory();
        samReaderFactory.referenceSequence(this.faidx);
        sr = samReaderFactory.open(SamInputResource.of(oneFileOrNull(args)));
        final SAMFileHeader samFileHeader = sr.getFileHeader();
        // check it's sorted on coordinate
        if (!samFileHeader.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
            LOG.error("input is not sorted on coordinate but \"" + samFileHeader.getSortOrder() + "\"");
            return -1;
        }
        if (this.saveBamTo != null) {
            sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(samFileHeader, true, this.saveBamTo);
        }
        if (this.use_bai && !sr.hasIndex()) {
            LOG.warning("Cannot used bai because input is not indexed");
        }
        // if there is a bai, only query the bam in the regions of splicing
        if (this.use_bai && sr.hasIndex()) {
            LOG.info("building intervals...");
            final SAMSequenceDictionary samdict = SequenceDictionaryUtils.extractRequired(samFileHeader);
            final ContigNameConverter samConvert = ContigNameConverter.fromOneDictionary(samdict);
            final List<QueryInterval> intervalsL = this.knownGenesMap.values().stream().flatMap(K -> K.stream()).filter(KG -> samConvert.apply(KG.getContig()) != null).flatMap(KG -> KG.getExons().stream()).flatMap(exon -> {
                // we need the reads overlapping the exon bounds
                final int tid = samdict.getSequenceIndex(samConvert.apply(exon.getGene().getContig()));
                final QueryInterval q1 = new QueryInterval(tid, exon.getStart() + 1, exon.getStart() + 1);
                final QueryInterval q2 = new QueryInterval(tid, exon.getEnd(), exon.getEnd());
                return Arrays.stream(new QueryInterval[] { q1, q2 });
            }).sorted().collect(Collectors.toList());
            final QueryInterval[] intervals = QueryInterval.optimizeIntervals(intervalsL.toArray(new QueryInterval[intervalsL.size()]));
            // GC
            intervalsL.clear();
            LOG.debug("Query bam using " + intervals.length + " random access intervals. Please wait...");
            iter = sr.queryOverlapping(intervals);
        } else {
            iter = sr.iterator();
        }
        final Set<String> samples = this.partiton.getPartitions(samFileHeader);
        if (samples.isEmpty()) {
            LOG.error("No sample was defined in the read group of the input bam.");
            return -1;
        }
        /**
         * build vcf header
         */
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
        metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
        metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
        metaData.add(new VCFInfoHeaderLine(ATT_BEST_MATCHING_LENGTH, 1, VCFHeaderLineType.Integer, "Best Matching length"));
        metaData.add(new VCFFormatHeaderLine(ATT_BEST_MATCHING_LENGTH, 1, VCFHeaderLineType.Integer, "Best Matching length"));
        metaData.add(new VCFFormatHeaderLine(ATT_GT_INTRON, 1, VCFHeaderLineType.String, "Introns info: (intron-0-idx,valid,dp-5,dp-3,max-len-5,max-len-3,avg-5,avg-3)*"));
        // metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
        // metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
        // "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
        metaData.add(new VCFInfoHeaderLine(ATT_INSERTION, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Possible place of insertion:" + "chr:start-end|count-evidence|mate-genes|non-coding|distance"));
        metaData.add(new VCFInfoHeaderLine(ATT_KG_STRAND, 1, VCFHeaderLineType.String, "KnownGene strand."));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_INFO, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Introns found: chr|start|end|seq-left|seq-right"));
        metaData.add(new VCFFilterHeaderLine(ATT_FILTER_NONDOCODING, "Only non-coding transcripts"));
        metaData.add(new VCFInfoHeaderLine(ATT_SAMPLES, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples found. partition:" + this.partiton.name()));
        metaData.add(new VCFFilterHeaderLine(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold, "Number of read is lower than :" + this.min_depth));
        metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
        final VCFHeader header = new VCFHeader(metaData, samples);
        JVarkitVersion.getInstance().addMetaData(this, header);
        header.setSequenceDictionary(refDict);
        /* open vcf for writing*/
        vcw0 = super.openVariantContextWriter(this.outputFile);
        final VariantContextWriter vcw = vcw0;
        vcw.writeHeader(header);
        /* save gene writer */
        if (this.saveBedPeTo != null) {
            this.saveInsertionsPw = super.openPathOrStdoutAsPrintWriter(this.saveBedPeTo);
        } else {
            this.saveInsertionsPw = new PrintWriter(new NullOuputStream());
        }
        final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(samFileHeader).logger(LOG).build();
        while (iter.hasNext()) {
            final SAMRecord rec = progress.apply(iter.next());
            if (rec.getReadUnmappedFlag())
                continue;
            if (rec.getMappingQuality() < this.min_read_mapq)
                continue;
            if (rec.isSecondaryOrSupplementary())
                continue;
            if (rec.getDuplicateReadFlag())
                continue;
            final byte[] bases = rec.getReadBases();
            if (bases == null || bases == SAMRecord.NULL_SEQUENCE)
                continue;
            final Cigar cigar = rec.getCigar();
            if (cigar == null || cigar.numCigarElements() < 2)
                continue;
            final String refContig = this.refCtgNameConverter.apply(rec.getContig());
            if (StringUtils.isBlank(refContig))
                continue;
            boolean save_read_to_bam = false;
            /* get sample */
            final String sampleName = this.partiton.getPartion(rec, null);
            if (StringUtils.isBlank(sampleName))
                continue;
            /* this is a new reference sequence */
            if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(refContig)) {
                if (this.genomicSequence != null) {
                    /* DUMP things BEFORE changing the reference sequence!!! */
                    /* dump buffer */
                    dump(vcw, null);
                }
                /* map transcript-name to their  transcript */
                /*this.kgId2knownGenes.clear();
					this.knownGenesMap.values().
						stream().
						flatMap(L->L.stream()).
						filter(G->refContig.equals(G.getContig())).
						forEach(K->this.kgId2knownGenes.put(K.getName(), K));*/
                /* now, we can change genomicSequence */
                this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, refContig);
            }
            final CigarElement leftCigar = cigar.getCigarElement(0);
            final CigarElement rightCigar = cigar.getCigarElement(cigar.numCigarElements() - 1);
            /* both ends are not candidate */
            if (!isCandidateCigarElement(leftCigar) && !isCandidateCigarElement(rightCigar)) {
                continue;
            }
            final List<KnownGene> genes = this.knownGenesMap.getOverlapping(new Interval(refContig, rec.getUnclippedStart(), rec.getUnclippedEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
            /* time to time dump the buffer for the transcripts before the current ones*/
            if (!genes.isEmpty()) {
                // get all genes overlapping the current set of genes
                int minTxStart = genes.stream().mapToInt(K -> K.getTxStart()).min().getAsInt();
                final int maxTxStart = genes.stream().mapToInt(K -> K.getTxEnd()).max().getAsInt();
                // update minTxtStart to get the lowest gene overlapping the set of genes
                minTxStart = this.knownGenesMap.getOverlapping(new Interval(refContig, minTxStart, maxTxStart)).stream().flatMap(L -> L.stream()).mapToInt(K -> K.getStart()).min().getAsInt();
                // not max, because we only need the 5' side
                dump(vcw, new Interval(refContig, minTxStart, minTxStart));
            }
            /* test each side of the clipped read */
            for (int side = 0; side < 2 && !genes.isEmpty(); ++side) {
                final CigarElement ce_side = (side == 0 ? leftCigar : rightCigar);
                if (!isCandidateCigarElement(ce_side))
                    continue;
                for (final KnownGene knownGene : genes) {
                    for (int exonIndex = 0; exonIndex < knownGene.getExonCount(); exonIndex++) {
                        if (side == 0) /* looking at cigar string in 5' */
                        {
                            if (exonIndex == 0)
                                continue;
                            // first 'M' element
                            final CigarLocatable cigarM = new CigarLocatable(refContig, rec, 1);
                            // first cigar element
                            final CigarLocatable cigarS = new CigarLocatable(refContig, rec, 0);
                            // current exon
                            final ExonOne exonRight = new ExonOne(knownGene.getExon(exonIndex));
                            if (!cigarM.overlaps(exonRight))
                                continue;
                            if (!(exonRight.getStart() >= cigarM.getStart()))
                                continue;
                            // get exon on the left
                            final ExonOne exonLeft = new ExonOne(knownGene.getExon(exonIndex - 1));
                            if (exonLeft.getLengthOnReference() < this.minCigarSize)
                                continue;
                            /* end of cigar 'M' can have same bases than the prev exon. */
                            final int malus = exonRight.getStart() - cigarM.getStart();
                            int genomic1 = exonLeft.getEnd() - malus;
                            if (genomic1 < exonLeft.getStart() || genomic1 > exonLeft.getEnd()) {
                                continue;
                            }
                            int matchLength = (this.use_malus ? malus : 0);
                            int readIdx0 = cigarS.size() - 1;
                            // loop over sequence
                            while (readIdx0 >= 0 && genomic1 >= exonLeft.getStart()) {
                                final char read_base = cigarS.readBaseAt0(readIdx0);
                                final char genome_base = exonLeft.charAt1(genomic1);
                                if (read_base != genome_base) {
                                    break;
                                }
                                readIdx0--;
                                matchLength++;
                                genomic1--;
                            }
                            if (matchLength < this._priv_ignoreCigarSize)
                                continue;
                            final KnownGene.Intron intron = knownGene.getIntron(exonIndex - 1);
                            // find match or create new
                            final Match match = new Match(intron, sampleName, rec, SIDE_3_PRIME, matchLength);
                            this.intronBuffer.add(match);
                            save_read_to_bam = true;
                        } else /* test last cigar */
                        {
                            if (exonIndex + 1 >= knownGene.getExonCount())
                                continue;
                            // last 'M' element
                            final CigarLocatable cigarM = new CigarLocatable(refContig, rec, cigar.numCigarElements() - 2);
                            // last cigar element
                            final CigarLocatable cigarS = new CigarLocatable(refContig, rec, cigar.numCigarElements() - 1);
                            // current exon
                            final ExonOne exonLeft = new ExonOne(knownGene.getExon(exonIndex));
                            if (!cigarM.overlaps(exonLeft))
                                continue;
                            if (!(exonLeft.getEnd() <= cigarM.getEnd()))
                                continue;
                            // get next exon
                            final ExonOne exonRight = new ExonOne(knownGene.getExon(exonIndex + 1));
                            if (exonRight.getLengthOnReference() < this.minCigarSize)
                                continue;
                            /* end of cigar 'M' can have same bases than the next exon. */
                            final int malus = cigarM.getEnd() - exonLeft.getEnd();
                            int genomic1 = exonRight.getStart() + malus;
                            if (genomic1 < exonRight.getStart() || genomic1 > exonRight.getEnd()) {
                                continue;
                            }
                            int matchLength = (this.use_malus ? malus : 0);
                            int readIdx0 = 0;
                            // loop over sequence
                            while (readIdx0 < cigarS.size() && genomic1 <= exonRight.getEnd()) {
                                final char read_base = cigarS.readBaseAt0(readIdx0);
                                final char genome_base = exonRight.charAt1(genomic1);
                                if (read_base != genome_base) {
                                    break;
                                }
                                readIdx0++;
                                matchLength++;
                                genomic1++;
                            }
                            if (matchLength < this._priv_ignoreCigarSize)
                                continue;
                            // find match or create new
                            final KnownGene.Intron intron = knownGene.getIntron(exonIndex);
                            final Match match = new Match(intron, sampleName, rec, SIDE_5_PRIME, matchLength);
                            this.intronBuffer.add(match);
                            save_read_to_bam = true;
                        }
                    }
                }
            }
            // end for side
            if (save_read_to_bam && sfw != null)
                sfw.addAlignment(rec);
        }
        /* dump buffer */
        dump(vcw, null);
        progress.close();
        vcw.close();
        iter.close();
        iter = null;
        sr.close();
        sr = null;
        this.saveInsertionsPw.flush();
        this.saveInsertionsPw.close();
        this.saveInsertionsPw = null;
        if (sfw != null) {
            sfw.close();
            sfw = null;
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sr);
        CloserUtil.close(vcw0);
        CloserUtil.close(sfw);
        CloserUtil.close(this.indexedFastaSequenceFile);
        CloserUtil.close(this.saveInsertionsPw);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) AbstractCharSequence(com.github.lindenb.jvarkit.lang.AbstractCharSequence) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) Map(java.util.Map) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) SAMFileWriter(htsjdk.samtools.SAMFileWriter) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) Stream(java.util.stream.Stream) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) IntStream(java.util.stream.IntStream) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) SAMUtils(htsjdk.samtools.SAMUtils) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) HashMap(java.util.HashMap) Function(java.util.function.Function) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) File(java.io.File) SamInputResource(htsjdk.samtools.SamInputResource) QueryInterval(htsjdk.samtools.QueryInterval) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) BufferedReader(java.io.BufferedReader) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) QueryInterval(htsjdk.samtools.QueryInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) PrintWriter(java.io.PrintWriter) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMFileWriter(htsjdk.samtools.SAMFileWriter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) CigarElement(htsjdk.samtools.CigarElement) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) BufferedReader(java.io.BufferedReader) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval)

Aggregations

CloseableIterator (htsjdk.samtools.util.CloseableIterator)103 List (java.util.List)86 Logger (com.github.lindenb.jvarkit.util.log.Logger)85 Parameter (com.beust.jcommander.Parameter)82 Program (com.github.lindenb.jvarkit.util.jcommander.Program)78 ArrayList (java.util.ArrayList)73 Collectors (java.util.stream.Collectors)71 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)69 Path (java.nio.file.Path)69 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)66 CloserUtil (htsjdk.samtools.util.CloserUtil)64 Set (java.util.Set)64 VCFHeader (htsjdk.variant.vcf.VCFHeader)59 VariantContext (htsjdk.variant.variantcontext.VariantContext)54 IOException (java.io.IOException)53 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)51 SAMRecord (htsjdk.samtools.SAMRecord)51 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)51 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)50 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)49