Search in sources :

Example 56 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class CoverageFactory method getSimpleCoverage.

/**
 * @param reader
 * @param locCollection collection of items. must not be empty, all position must belong to the sampel contig
 * @param sample
 * @return
 */
public SimpleCoverage getSimpleCoverage(final SamReader reader, final Collection<? extends Locatable> locCollection, final String sample) {
    if (reader == null)
        throw new IllegalArgumentException("reader==null");
    if (!reader.hasIndex())
        throw new IllegalArgumentException("SamReader is not indexed. " + reader.getResourceDescription());
    if (locCollection.isEmpty())
        throw new IllegalArgumentException("Empty Query");
    final SAMFileHeader header = reader.getFileHeader();
    final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
    final Set<String> contigs = locCollection.stream().map(L -> L.getContig()).collect(Collectors.toSet());
    if (contigs.size() != 1)
        throw new IllegalArgumentException("Expected only one chromosome but got:" + String.join(",", contigs));
    final String contig = contigs.iterator().next();
    final int chromStart = locCollection.stream().mapToInt(L -> L.getStart()).min().orElse(0);
    final int chromEnd = locCollection.stream().mapToInt(L -> L.getEnd()).max().orElse(0);
    final Locatable loc = new SimpleInterval(contig, chromStart, chromEnd);
    final SimpleCoverageImpl cov = new SimpleCoverageImpl(loc);
    final int tid = dict.getSequenceIndex(contig);
    if (tid < 0) {
        return cov;
    }
    if (!StringUtils.isBlank(sample) && header.getReadGroups().stream().map(RG -> this.partition.apply(RG)).noneMatch(S -> sample.equals(S))) {
        return cov;
    }
    int idx = 0;
    QueryInterval[] array = new QueryInterval[locCollection.size()];
    for (final Locatable item : locCollection) {
        array[idx] = new QueryInterval(tid, item.getStart(), item.getEnd());
        idx++;
    }
    array = QueryInterval.optimizeIntervals(array);
    try (CloseableIterator<SAMRecord> iter = reader.query(array, false)) {
        while (iter.hasNext()) {
            final SAMRecord rec = iter.next();
            if (rec.getReadUnmappedFlag())
                continue;
            if (rec.getMappingQuality() < this.mappingQuality)
                continue;
            if (!this.samRecordFilter.test(rec))
                continue;
            final Cigar cigar = rec.getCigar();
            if (cigar == null || cigar.isEmpty())
                continue;
            if (!StringUtils.isBlank(sample) && !SAMRecordPartition.any.equals(this.partition)) {
                if (!sample.equals(this.partition.getPartion(rec)))
                    continue;
            }
            int maxEnd = loc.getEnd();
            if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex())) {
                int mateStart;
                if (this.useClip && SAMUtils.getMateCigar(rec) != null) {
                    mateStart = SAMUtils.getMateUnclippedStart(rec);
                } else {
                    mateStart = rec.getMateAlignmentStart();
                }
                if (rec.getStart() < mateStart && rec.getEnd() > mateStart) {
                    maxEnd = Math.min(maxEnd, mateStart);
                }
            }
            if (!this.useClip) {
                for (AlignmentBlock block : rec.getAlignmentBlocks()) {
                    if (block.getReferenceStart() > maxEnd)
                        break;
                    for (int t = 0; t < block.getLength(); t++) {
                        final int pos1 = block.getReferenceStart() + t;
                        if (pos1 < loc.getStart())
                            continue;
                        if (pos1 > maxEnd)
                            break;
                        cov.coverage[pos1 - loc.getStart()]++;
                    }
                }
            } else {
                int ref1 = rec.getUnclippedStart();
                for (final CigarElement ce : cigar) {
                    if (ref1 > maxEnd)
                        break;
                    final CigarOperator op = ce.getOperator();
                    switch(op) {
                        case P:
                            break;
                        case N:
                        case D:
                            ref1 += ce.getLength();
                            break;
                        case I:
                            break;
                        case H:
                        case S:
                        case X:
                        case EQ:
                        case M:
                            {
                                for (int i = 0; i < ce.getLength(); i++) {
                                    final int pos1 = ref1 + i;
                                    if (pos1 < loc.getStart())
                                        continue;
                                    if (pos1 > maxEnd)
                                        break;
                                    cov.coverage[pos1 - loc.getStart()]++;
                                }
                                ref1 += ce.getLength();
                                break;
                            }
                        default:
                            throw new IllegalStateException(op.name());
                    }
                }
            }
        }
    }
    return cov;
}
Also used : IntStream(java.util.stream.IntStream) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Arrays(java.util.Arrays) SAMUtils(htsjdk.samtools.SAMUtils) CigarElement(htsjdk.samtools.CigarElement) OptionalDouble(java.util.OptionalDouble) CigarOperator(htsjdk.samtools.CigarOperator) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) OptionalInt(java.util.OptionalInt) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ArrayList(java.util.ArrayList) AlignmentBlock(htsjdk.samtools.AlignmentBlock) Interval(htsjdk.samtools.util.Interval) DiscreteMedian(com.github.lindenb.jvarkit.math.DiscreteMedian) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) Iterator(java.util.Iterator) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Predicate(java.util.function.Predicate) Collection(java.util.Collection) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) SamReader(htsjdk.samtools.SamReader) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) QueryInterval(htsjdk.samtools.QueryInterval) Collections(java.util.Collections) QueryInterval(htsjdk.samtools.QueryInterval) CigarOperator(htsjdk.samtools.CigarOperator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) CigarElement(htsjdk.samtools.CigarElement) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SAMFileHeader(htsjdk.samtools.SAMFileHeader) AlignmentBlock(htsjdk.samtools.AlignmentBlock) Locatable(htsjdk.samtools.util.Locatable)

Example 57 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class VcfLoopOverGenes method doWork.

@SuppressWarnings("resource")
@Override
public int doWork(final List<String> args) {
    PrintWriter pw = null;
    VCFReader vcfFileReader = null;
    CloseableIterator<VariantContext> iter = null;
    CloseableIterator<GeneLoc> iter2 = null;
    BufferedReader br = null;
    ArchiveFactory archive = null;
    try {
        final Path vcf = Paths.get(oneAndOnlyOneFile(args));
        vcfFileReader = VCFReaderFactory.makeDefault().open(vcf, (this.geneFile != null || !StringUtil.isBlank(this.regionStr)));
        this.dictionary = vcfFileReader.getHeader().getSequenceDictionary();
        if (this.dictionary == null) {
            throw new JvarkitException.VcfDictionaryMissing(vcf);
        }
        final VcfTools tools = new VcfTools(vcfFileReader.getHeader());
        if (!this.prefix.isEmpty() && !this.prefix.endsWith(".")) {
            this.prefix += ".";
        }
        if (this.geneFile == null) {
            final SortingCollection<GeneLoc> sortingCollection = SortingCollection.newInstance(GeneLoc.class, new GeneLocCodec(), (A, B) -> A.compareTo(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
            sortingCollection.setDestructiveIteration(true);
            if (StringUtil.isBlank(this.regionStr)) {
                iter = vcfFileReader.iterator();
            } else {
                final SimpleInterval interval = IntervalParserFactory.newInstance().dictionary(this.dictionary).enableWholeContig().make().apply(this.regionStr).orElseThrow(IntervalParserFactory.exception(this.regionStr));
                iter = vcfFileReader.query(interval.getContig(), interval.getStart(), interval.getEnd());
            }
            final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfFileReader.getHeader()).logger(LOG);
            if (this.splitMethod.equals(SplitMethod.Annotations)) {
                while (iter.hasNext()) {
                    final VariantContext ctx = progress.watch(iter.next());
                    for (final AnnPredictionParser.AnnPrediction pred : tools.getAnnPredictionParser().getPredictions(ctx)) {
                        if (this.snpEffNoIntergenic && pred.isIntergenicRegion()) {
                            continue;
                        }
                        if (!StringUtil.isBlank(pred.getGeneName())) {
                            sortingCollection.add(create(ctx, pred.getGeneName(), SourceType.ANN_GeneName));
                        }
                        if (!StringUtil.isBlank(pred.getGeneId())) {
                            sortingCollection.add(create(ctx, pred.getGeneId(), SourceType.ANN_GeneID));
                        }
                        if (!StringUtil.isBlank(pred.getFeatureId())) {
                            sortingCollection.add(create(ctx, pred.getFeatureId(), SourceType.ANN_FeatureID));
                        }
                    }
                    for (final VepPredictionParser.VepPrediction pred : tools.getVepPredictionParser().getPredictions(ctx)) {
                        if (!StringUtil.isBlank(pred.getGene())) {
                            sortingCollection.add(create(ctx, pred.getGene(), SourceType.VEP_Gene));
                        }
                        if (!StringUtil.isBlank(pred.getFeature())) {
                            sortingCollection.add(create(ctx, pred.getFeature(), SourceType.VEP_Feature));
                        }
                        if (!StringUtil.isBlank(pred.getSymbol())) {
                            sortingCollection.add(create(ctx, pred.getSymbol(), SourceType.VEP_Symbol));
                        }
                        if (!StringUtil.isBlank(pred.getHgncId())) {
                            sortingCollection.add(create(ctx, pred.getHgncId(), SourceType.VEP_HgncId));
                        }
                    }
                }
            } else /**
             * split VCF per sliding window of variants
             */
            if (this.splitMethod.equals(SplitMethod.VariantSlidingWindow)) {
                if (this.variantsWinCount < 1) {
                    LOG.error("Bad value for variantsWinCount");
                    return -1;
                }
                if (this.variantsWinShift < 1 || this.variantsWinShift > this.variantsWinCount) {
                    LOG.error("Bad value for variantsWinShift");
                    return -1;
                }
                final List<VariantContext> buffer = new ArrayList<>(this.variantsWinCount);
                /**
                 * routine to dump buffer into sorting collection
                 */
                final Runnable dumpBuffer = () -> {
                    if (buffer.isEmpty())
                        return;
                    final String contig = buffer.get(0).getContig();
                    final int chromStart = buffer.stream().mapToInt(CTX -> CTX.getStart()).min().getAsInt();
                    // use last of start too
                    final int chromEnd0 = buffer.stream().mapToInt(CTX -> CTX.getStart()).max().getAsInt();
                    // final int chromEnd1 = buffer.stream().mapToInt(CTX->CTX.getEnd()).max().getAsInt();
                    final String identifier = contig + "_" + String.format(NUM_FORMAT, chromStart) + "_" + String.format(NUM_FORMAT, chromEnd0);
                    for (final VariantContext ctx : buffer) {
                        sortingCollection.add(create(ctx, identifier, SourceType.SlidingVariants));
                    }
                };
                while (iter.hasNext()) {
                    VariantContext ctx = progress.watch(iter.next());
                    /* reduce the memory footprint for this context */
                    ctx = new VariantContextBuilder(ctx).genotypes(Collections.emptyList()).unfiltered().rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make();
                    if (!buffer.isEmpty() && !buffer.get(0).getContig().equals(ctx.getContig())) {
                        dumpBuffer.run();
                        buffer.clear();
                    }
                    buffer.add(ctx);
                    if (buffer.size() >= this.variantsWinCount) {
                        dumpBuffer.run();
                        final int fromIndex = Math.min(this.variantsWinShift, buffer.size());
                        buffer.subList(0, fromIndex).clear();
                    }
                }
                dumpBuffer.run();
                buffer.clear();
            } else if (this.splitMethod.equals(SplitMethod.ContigSlidingWindow)) {
                if (this.contigWinLength < 1) {
                    LOG.error("Bad value for contigWinCount");
                    return -1;
                }
                if (this.contigWinShift < 1 || this.contigWinShift > this.contigWinLength) {
                    LOG.error("Bad value for contigWinShift");
                    return -1;
                }
                while (iter.hasNext()) {
                    VariantContext ctx = progress.watch(iter.next());
                    /* reduce the memory footprint for this context */
                    ctx = new VariantContextBuilder(ctx).genotypes(Collections.emptyList()).unfiltered().rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make();
                    int start = 0;
                    while (start <= ctx.getStart()) {
                        if (start + this.contigWinLength >= ctx.getStart()) {
                            final int chromStart = start;
                            final int chromEnd0 = start + this.contigWinLength;
                            final String identifier = ctx.getContig() + "_" + String.format(NUM_FORMAT, chromStart) + "_" + String.format(NUM_FORMAT, chromEnd0);
                            sortingCollection.add(create(ctx, identifier, SourceType.SlidingContig));
                        }
                        start += this.contigWinShift;
                    }
                }
            } else {
                throw new IllegalStateException("No such method: " + this.splitMethod);
            }
            sortingCollection.doneAdding();
            progress.finish();
            iter.close();
            iter = null;
            pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
            iter2 = sortingCollection.iterator();
            final EqualRangeIterator<GeneLoc> eqiter = new EqualRangeIterator<>(iter2, this.compareGeneName);
            int geneIdentifierId = 0;
            while (eqiter.hasNext()) {
                final List<GeneLoc> gene = eqiter.next();
                pw.print(gene.get(0).contig);
                pw.print('\t');
                // -1 for BED
                pw.print(gene.stream().mapToInt(G -> G.start).min().getAsInt() - 1);
                pw.print('\t');
                pw.print(gene.stream().mapToInt(G -> G.end).max().getAsInt());
                pw.print('\t');
                pw.print(this.prefix + String.format("%09d", ++geneIdentifierId));
                pw.print('\t');
                pw.print(gene.get(0).geneName);
                pw.print('\t');
                pw.print(gene.get(0).sourceType);
                pw.print('\t');
                pw.print(gene.size());
                pw.println();
            }
            pw.flush();
            pw.close();
            pw = null;
            eqiter.close();
            iter2.close();
            iter2 = null;
            sortingCollection.cleanup();
        } else {
            if (this.nJobs < 1) {
                this.nJobs = Math.max(1, Runtime.getRuntime().availableProcessors());
                LOG.info("setting njobs to " + this.nJobs);
            }
            final ExecutorService executorService;
            final List<Future<Integer>> futureResults;
            if (this.nJobs > 1) {
                executorService = new ThreadPoolExecutor(this.nJobs, this.nJobs, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>());
                futureResults = new ArrayList<>();
            } else {
                executorService = null;
                futureResults = Collections.emptyList();
            }
            if (this.outputFile == null) {
                LOG.error("When scanning a VCF with " + this.geneFile + ". Output file must be defined");
            }
            if (!this.exec.isEmpty()) {
                if (this.outputFile.getName().endsWith(".zip")) {
                    LOG.error("Cannot execute " + this.exec + " when saving to a zip.");
                    return -1;
                }
            }
            archive = ArchiveFactory.open(this.outputFile);
            PrintWriter manifest = this.deleteAfterCommand && !this.exec.isEmpty() ? // all files will be deleted, no manifest needed
            new PrintWriter(new NullOuputStream()) : archive.openWriter(this.prefix + "manifest.txt");
            br = IOUtils.openFileForBufferedReading(this.geneFile);
            final BedLineCodec bedCodec = new BedLineCodec();
            for (; ; ) {
                if (!futureResults.isEmpty()) {
                    int i = 0;
                    while (i < futureResults.size()) {
                        final Future<Integer> r = futureResults.get(i);
                        if (r.isCancelled()) {
                            LOG.error("Task was canceled. Break.");
                            return -1;
                        } else if (r.isDone()) {
                            futureResults.remove(i);
                            int rez = r.get();
                            if (rez != 0) {
                                LOG.error("Task Failed (" + rez + "). Break");
                            }
                        } else {
                            i++;
                        }
                    }
                }
                final String line = br.readLine();
                if (line == null)
                    break;
                if (line.startsWith("#") || line.isEmpty())
                    continue;
                final BedLine bedLine = bedCodec.decode(line);
                if (bedLine == null)
                    continue;
                // ID
                final String geneIdentifier = bedLine.get(3);
                // name
                final String geneName = bedLine.get(4);
                final SourceType sourceType = SourceType.valueOf(bedLine.get(5));
                final String filename = geneIdentifier;
                final String outputVcfName = (filename.startsWith(this.prefix) ? "" : this.prefix) + filename + ".vcf" + (this.compress ? ".gz" : "");
                LOG.info(bedLine.getContig() + ":" + bedLine.getStart() + "-" + bedLine.getEnd() + " length :" + (bedLine.getEnd() - bedLine.getStart()));
                if (bedLine.getEnd() - bedLine.getStart() > 1E6) {
                    LOG.warn("That's a large region ! " + bedLine);
                }
                OutputStream vcfOutputStream = null;
                VariantContextWriter vw = null;
                int countVariants = 0;
                final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfFileReader.getHeader()).logger(LOG).prefix(geneName + " " + bedLine.getContig() + ":" + bedLine.getStart() + "-" + bedLine.getEnd());
                iter = vcfFileReader.query(bedLine.getContig(), bedLine.getStart(), bedLine.getEnd());
                while (iter.hasNext()) {
                    VariantContext ctx = progress.watch(iter.next());
                    switch(sourceType) {
                        case SlidingVariants:
                            {
                                // nothing
                                break;
                            }
                        case SlidingContig:
                            {
                                // nothing
                                break;
                            }
                        case ANN_GeneName:
                        case ANN_FeatureID:
                        case ANN_GeneID:
                            {
                                final List<String> preds = new ArrayList<>();
                                for (final AnnPredictionParser.AnnPrediction pred : tools.getAnnPredictionParser().getPredictions(ctx)) {
                                    final String predictionIdentifier;
                                    switch(sourceType) {
                                        case ANN_GeneName:
                                            predictionIdentifier = pred.getGeneName();
                                            break;
                                        case ANN_FeatureID:
                                            predictionIdentifier = pred.getFeatureId();
                                            break;
                                        case ANN_GeneID:
                                            predictionIdentifier = pred.getGeneId();
                                            break;
                                        default:
                                            throw new IllegalStateException(bedLine.toString());
                                    }
                                    if (StringUtil.isBlank(predictionIdentifier))
                                        continue;
                                    if (!geneName.equals(predictionIdentifier))
                                        continue;
                                    preds.add(pred.getOriginalAttributeAsString());
                                }
                                if (preds.isEmpty()) {
                                    ctx = null;
                                } else {
                                    ctx = new VariantContextBuilder(ctx).rmAttribute(tools.getAnnPredictionParser().getTag()).attribute(tools.getAnnPredictionParser().getTag(), preds).make();
                                }
                                break;
                            }
                        case VEP_Gene:
                        case VEP_Feature:
                        case VEP_Symbol:
                        case VEP_HgncId:
                            {
                                final List<String> preds = new ArrayList<>();
                                for (final VepPredictionParser.VepPrediction pred : tools.getVepPredictions(ctx)) {
                                    final String predictionIdentifier;
                                    switch(sourceType) {
                                        case VEP_Gene:
                                            predictionIdentifier = pred.getGene();
                                            break;
                                        case VEP_Feature:
                                            predictionIdentifier = pred.getFeature();
                                            break;
                                        case VEP_Symbol:
                                            predictionIdentifier = pred.getSymbol();
                                            break;
                                        case VEP_HgncId:
                                            predictionIdentifier = pred.getHgncId();
                                            break;
                                        default:
                                            throw new IllegalStateException(bedLine.toString());
                                    }
                                    if (StringUtil.isBlank(predictionIdentifier))
                                        continue;
                                    if (!geneName.equals(predictionIdentifier))
                                        continue;
                                    preds.add(pred.getOriginalAttributeAsString());
                                }
                                if (preds.isEmpty()) {
                                    ctx = null;
                                } else {
                                    ctx = new VariantContextBuilder(ctx).rmAttribute(tools.getVepPredictionParser().getTag()).attribute(tools.getVepPredictionParser().getTag(), preds).make();
                                }
                                break;
                            }
                        default:
                            throw new IllegalStateException(bedLine.toString());
                    }
                    if (ctx == null)
                        continue;
                    if (vcfOutputStream == null) {
                        LOG.info(filename);
                        manifest.println(outputVcfName);
                        final VCFHeader header = new VCFHeader(vcfFileReader.getHeader());
                        header.addMetaDataLine(new VCFHeaderLine(VCF_HEADER_SPLITKEY, filename));
                        vcfOutputStream = archive.openOuputStream(outputVcfName);
                        vw = VCFUtils.createVariantContextWriterToOutputStream(vcfOutputStream);
                        vw.writeHeader(header);
                    }
                    countVariants++;
                    vw.add(ctx);
                    if (countVariants % 1000 == 0) {
                        LOG.info("Loading : " + geneIdentifier + " N=" + countVariants);
                    }
                }
                progress.finish();
                LOG.info(geneIdentifier + " N=" + countVariants);
                if (vcfOutputStream != null) {
                    vw.close();
                    vcfOutputStream.flush();
                    vcfOutputStream.close();
                    vw = null;
                    if (!this.exec.isEmpty()) {
                        final Callable<Integer> callable = () -> {
                            final File vcfOutFile = new File(this.outputFile, outputVcfName);
                            IOUtil.assertFileIsReadable(vcfOutFile);
                            final String vcfPath = vcfOutFile.getPath();
                            final StringTokenizer st = new StringTokenizer(this.exec);
                            final List<String> command = new ArrayList<>(1 + st.countTokens());
                            while (st.hasMoreTokens()) {
                                String token = st.nextToken().replaceAll("__PREFIX__", this.prefix).replaceAll("__CONTIG__", bedLine.getContig()).replaceAll("__CHROM__", bedLine.getContig()).replaceAll("__ID__", geneIdentifier).replaceAll("__NAME__", geneName).replaceAll("__START__", String.valueOf(bedLine.getStart())).replaceAll("__END__", String.valueOf(bedLine.getEnd())).replaceAll("__SOURCE__", sourceType.name()).replaceAll("__VCF__", vcfPath);
                                command.add(token);
                            }
                            LOG.info(command.stream().map(S -> "'" + S + "'").collect(Collectors.joining(" ")));
                            final ProcessBuilder pb = new ProcessBuilder(command);
                            pb.redirectErrorStream(true);
                            final Process p = pb.start();
                            final Thread stdoutThread = new Thread(() -> {
                                try {
                                    InputStream in = p.getInputStream();
                                    IOUtils.copyTo(in, stdout());
                                } catch (Exception err) {
                                    LOG.error(err);
                                }
                            });
                            stdoutThread.start();
                            int exitValue = p.waitFor();
                            if (exitValue != 0) {
                                LOG.error("Command failed (" + exitValue + "):" + String.join(" ", command));
                                return -1;
                            } else {
                                if (deleteAfterCommand) {
                                    if (!vcfOutFile.delete()) {
                                        LOG.warn("Cannot delete " + vcfOutFile);
                                    }
                                }
                                return 0;
                            }
                        };
                        if (executorService != null) {
                            final Future<Integer> rez = executorService.submit(callable);
                            futureResults.add(rez);
                        } else {
                            final int ret = callable.call();
                            if (ret != 0) {
                                LOG.error("Error with process (" + ret + ")");
                                return ret;
                            }
                        }
                    }
                } else {
                    manifest.println("#" + filename);
                    LOG.warn("No Variant Found for " + line);
                }
                iter.close();
            }
            ;
            if (executorService != null) {
                LOG.info("shutdown");
                executorService.shutdown();
                executorService.awaitTermination(365, TimeUnit.DAYS);
            }
            br.close();
            br = null;
            manifest.close();
            archive.close();
            archive = null;
            LOG.info("Done");
        }
        vcfFileReader.close();
        vcfFileReader = null;
        return 0;
    } catch (Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        {
            CloserUtil.close(iter2);
            CloserUtil.close(iter);
            CloserUtil.close(pw);
            CloserUtil.close(vcfFileReader);
            CloserUtil.close(br);
            CloserUtil.close(archive);
        }
    }
}
Also used : Program(com.github.lindenb.jvarkit.util.jcommander.Program) IOUtil(htsjdk.samtools.util.IOUtil) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) IntervalParserFactory(com.github.lindenb.jvarkit.samtools.util.IntervalParserFactory) Future(java.util.concurrent.Future) DataOutputStream(java.io.DataOutputStream) StringUtil(htsjdk.samtools.util.StringUtil) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Logger(com.github.lindenb.jvarkit.util.log.Logger) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) VariantContext(htsjdk.variant.variantcontext.VariantContext) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DataInputStream(java.io.DataInputStream) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) Callable(java.util.concurrent.Callable) Function(java.util.function.Function) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) StringTokenizer(java.util.StringTokenizer) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) ExecutorService(java.util.concurrent.ExecutorService) OutputStream(java.io.OutputStream) SortingCollection(htsjdk.samtools.util.SortingCollection) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) IOException(java.io.IOException) File(java.io.File) TimeUnit(java.util.concurrent.TimeUnit) Paths(java.nio.file.Paths) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) Collections(java.util.Collections) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) InputStream(java.io.InputStream) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DataOutputStream(java.io.DataOutputStream) OutputStream(java.io.OutputStream) ArrayList(java.util.ArrayList) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) List(java.util.List) ArrayList(java.util.ArrayList) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) File(java.io.File) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFReader(htsjdk.variant.vcf.VCFReader) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) Path(java.nio.file.Path) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) DataInputStream(java.io.DataInputStream) InputStream(java.io.InputStream) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) IOException(java.io.IOException) StringTokenizer(java.util.StringTokenizer) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor)

Example 58 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class Biostar78285 method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.gc_percent_window < 1) {
        LOG.error("Bad GC% window size:" + this.gc_percent_window);
        return -1;
    }
    final List<File> bamFiles = IOUtil.unrollFiles(args.stream().map(F -> new File(F)).collect(Collectors.toCollection(HashSet::new)), ".bam");
    SAMSequenceDictionary dict = null;
    final List<SamReader> samReaders = new ArrayList<>();
    final List<CloseableIterator<SAMRecord>> samIterators = new ArrayList<>();
    final TreeSet<String> samples = new TreeSet<>();
    final String DEFAULT_PARTITION = "UNDEFINED_PARTITION";
    ReferenceSequenceFile indexedFastaSequenceFile = null;
    VariantContextWriter out = null;
    try {
        final SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.LENIENT);
        for (final File bamFile : bamFiles) {
            LOG.info("Opening " + bamFile);
            final SamReader samReader = samReaderFactory.open(bamFile);
            samReaders.add(samReader);
            final SAMFileHeader header = samReader.getFileHeader();
            if (header == null) {
                LOG.error("No header in " + bamFile);
                return -1;
            }
            JvarkitException.BamBadSortOrder.verify(SortOrder.coordinate, header);
            samples.addAll(header.getReadGroups().stream().map(RG -> this.partition.apply(RG, DEFAULT_PARTITION)).collect(Collectors.toSet()));
            final SAMSequenceDictionary currDict = header.getSequenceDictionary();
            if (currDict == null) {
                LOG.error("SamFile doesn't contain a SAMSequenceDictionary : " + bamFile);
                return -1;
            }
            if (dict == null) {
                dict = currDict;
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, currDict)) {
                LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict, currDict));
                return -1;
            }
        }
        if (samReaders.isEmpty()) {
            LOG.error("no bam");
            return -1;
        }
        if (dict == null) {
            LOG.error("no dictionary");
            return -1;
        }
        final QueryInterval[] intervals;
        if (this.captureBed != null) {
            LOG.info("Opening " + this.captureBed);
            ContigNameConverter.setDefaultAliases(dict);
            final List<QueryInterval> L = new ArrayList<>();
            try (BedLineReader li = new BedLineReader(this.captureBed)) {
                while (li.hasNext()) {
                    final BedLine bed = li.next();
                    if (bed == null)
                        continue;
                    final QueryInterval q = bed.toQueryInterval(dict);
                    L.add(q);
                }
            }
            intervals = QueryInterval.optimizeIntervals(L.toArray(new QueryInterval[L.size()]));
        } else {
            intervals = null;
        }
        for (final SamReader samReader : samReaders) {
            LOG.info("querying " + samReader.getResourceDescription());
            final CloseableIterator<SAMRecord> iter;
            if (intervals == null) {
                iter = samReader.iterator();
            } else {
                iter = samReader.queryOverlapping(intervals);
            }
            samIterators.add(new FilterIterator<SAMRecord>(iter, R -> !R.getReadUnmappedFlag() && !filter.filterOut(R)));
        }
        if (this.refFile != null) {
            indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.refFile);
            final SAMSequenceDictionary refdict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
            ContigNameConverter.setDefaultAliases(refdict);
            if (!SequenceUtil.areSequenceDictionariesEqual(dict, refdict)) {
                LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict, refdict));
                return -1;
            }
        }
        out = openVariantContextWriter(this.outputFile);
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY);
        metaData.add(new VCFFormatHeaderLine("DF", 1, VCFHeaderLineType.Integer, "Number of Reads on plus strand"));
        metaData.add(new VCFFormatHeaderLine("DR", 1, VCFHeaderLineType.Integer, "Number of Reads on minus strand"));
        metaData.add(new VCFInfoHeaderLine("AVG_DP", 1, VCFHeaderLineType.Float, "Mean depth"));
        metaData.add(new VCFInfoHeaderLine("MEDIAN_DP", 1, VCFHeaderLineType.Float, "Median depth"));
        metaData.add(new VCFInfoHeaderLine("MIN_DP", 1, VCFHeaderLineType.Integer, "Min depth"));
        metaData.add(new VCFInfoHeaderLine("MAX_DP", 1, VCFHeaderLineType.Integer, "Max depth"));
        metaData.add(new VCFHeaderLine(Biostar78285.class.getSimpleName() + ".SamFilter", this.filter.toString()));
        for (final Integer treshold : this.minDepthTresholds) {
            metaData.add(new VCFFilterHeaderLine("DP_LT_" + treshold, "All  genotypes have DP< " + treshold));
            metaData.add(new VCFInfoHeaderLine("NUM_DP_LT_" + treshold, 1, VCFHeaderLineType.Integer, "Number of genotypes having DP< " + treshold));
            metaData.add(new VCFInfoHeaderLine("FRACT_DP_LT_" + treshold, 1, VCFHeaderLineType.Float, "Fraction of genotypes having DP< " + treshold));
        }
        if (indexedFastaSequenceFile != null) {
            metaData.add(new VCFInfoHeaderLine("GC_PERCENT", 1, VCFHeaderLineType.Integer, "GC% window_size:" + this.gc_percent_window));
        }
        final List<Allele> refAlleles = Collections.singletonList(Allele.create("N", true));
        final List<Allele> NO_CALLS = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
        final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
        vcfHeader.setSequenceDictionary(dict);
        out.writeHeader(vcfHeader);
        final SAMRecordCoordinateComparator samRecordCoordinateComparator = new SAMRecordCoordinateComparator();
        final PeekableIterator<SAMRecord> peekIter = new PeekableIterator<>(new MergingIterator<>((R1, R2) -> samRecordCoordinateComparator.fileOrderCompare(R1, R2), samIterators));
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            final IntervalTree<Boolean> capturePos;
            if (intervals != null) {
                if (!Arrays.stream(intervals).anyMatch(I -> I.referenceIndex == ssr.getSequenceIndex())) {
                    continue;
                }
                capturePos = new IntervalTree<>();
                Arrays.stream(intervals).filter(I -> I.referenceIndex == ssr.getSequenceIndex()).forEach(I -> capturePos.put(I.start, I.end, true));
                ;
            } else {
                capturePos = null;
            }
            final GenomicSequence genomicSequence;
            if (indexedFastaSequenceFile != null && indexedFastaSequenceFile.getSequenceDictionary().getSequence(ssr.getSequenceName()) != null) {
                genomicSequence = new GenomicSequence(indexedFastaSequenceFile, ssr.getSequenceName());
            } else {
                genomicSequence = null;
            }
            final List<SAMRecord> buffer = new ArrayList<>();
            for (int ssr_pos = 1; ssr_pos <= ssr.getSequenceLength(); ++ssr_pos) {
                if (capturePos != null && !capturePos.overlappers(ssr_pos, ssr_pos).hasNext())
                    continue;
                progress.watch(ssr.getSequenceName(), ssr_pos);
                while (peekIter.hasNext()) {
                    final SAMRecord rec = peekIter.peek();
                    if (rec.getReadUnmappedFlag()) {
                        // consumme
                        peekIter.next();
                        continue;
                    }
                    if (this.filter.filterOut(rec)) {
                        // consumme
                        peekIter.next();
                        continue;
                    }
                    if (rec.getReferenceIndex() < ssr.getSequenceIndex()) {
                        throw new IllegalStateException("should not happen");
                    }
                    if (rec.getReferenceIndex() > ssr.getSequenceIndex()) {
                        break;
                    }
                    if (rec.getAlignmentEnd() < ssr_pos) {
                        throw new IllegalStateException("should not happen");
                    }
                    if (rec.getAlignmentStart() > ssr_pos) {
                        break;
                    }
                    buffer.add(peekIter.next());
                }
                int x = 0;
                while (x < buffer.size()) {
                    final SAMRecord R = buffer.get(x);
                    if (R.getReferenceIndex() != ssr.getSequenceIndex() || R.getAlignmentEnd() < ssr_pos) {
                        buffer.remove(x);
                    } else {
                        x++;
                    }
                }
                final Map<String, PosInfo> count = samples.stream().map(S -> new PosInfo(S)).collect(Collectors.toMap(P -> P.sample, Function.identity()));
                for (final SAMRecord rec : buffer) {
                    if (rec.getReferenceIndex() != ssr.getSequenceIndex())
                        throw new IllegalStateException("should not happen");
                    if (rec.getAlignmentEnd() < ssr_pos)
                        continue;
                    if (rec.getAlignmentStart() > ssr_pos)
                        continue;
                    final Cigar cigar = rec.getCigar();
                    if (cigar == null)
                        continue;
                    int refpos = rec.getAlignmentStart();
                    final String sample = this.partition.getPartion(rec, DEFAULT_PARTITION);
                    for (final CigarElement ce : cigar.getCigarElements()) {
                        if (refpos > ssr_pos)
                            break;
                        final CigarOperator op = ce.getOperator();
                        if (op.consumesReferenceBases()) {
                            if (op.consumesReadBases()) {
                                if (refpos <= ssr_pos && ssr_pos <= refpos + ce.getLength()) {
                                    final PosInfo posInfo = count.get(sample);
                                    if (posInfo != null) {
                                        posInfo.dp++;
                                        if (rec.getReadNegativeStrandFlag()) {
                                            posInfo.negative_strand++;
                                        }
                                    }
                                    break;
                                }
                            }
                            refpos += ce.getLength();
                        }
                    }
                }
                final VariantContextBuilder vcb = new VariantContextBuilder();
                final Set<String> filters = new HashSet<>();
                vcb.chr(ssr.getSequenceName());
                vcb.start(ssr_pos);
                vcb.stop(ssr_pos);
                if (genomicSequence == null) {
                    vcb.alleles(refAlleles);
                } else {
                    vcb.alleles(Collections.singletonList(Allele.create((byte) genomicSequence.charAt(ssr_pos - 1), true)));
                    final GenomicSequence.GCPercent gcp = genomicSequence.getGCPercent(Math.max((ssr_pos - 1) - this.gc_percent_window, 0), Math.min(ssr_pos + this.gc_percent_window, ssr.getSequenceLength()));
                    if (!gcp.isEmpty()) {
                        vcb.attribute("GC_PERCENT", gcp.getGCPercentAsInteger());
                    }
                }
                vcb.attribute(VCFConstants.DEPTH_KEY, count.values().stream().mapToInt(S -> S.dp).sum());
                vcb.genotypes(count.values().stream().map(C -> new GenotypeBuilder(C.sample, NO_CALLS).DP(C.dp).attribute("DR", C.negative_strand).attribute("DF", C.dp - C.negative_strand).make()).collect(Collectors.toList()));
                for (final Integer treshold : this.minDepthTresholds) {
                    final int count_lt = (int) count.values().stream().filter(S -> S.dp < treshold).count();
                    if (count_lt == samples.size()) {
                        filters.add("DP_LT_" + treshold);
                    }
                    vcb.attribute("NUM_DP_LT_" + treshold, count_lt);
                    if (!samples.isEmpty()) {
                        vcb.attribute("FRACT_DP_LT_" + treshold, count_lt / (float) samples.size());
                    }
                }
                if (!samples.isEmpty()) {
                    final int[] array = count.values().stream().mapToInt(S -> S.dp).toArray();
                    vcb.attribute("AVG_DP", Percentile.average().evaluate(array).getAsDouble());
                    vcb.attribute("MEDIAN_DP", Percentile.median().evaluate(array).getAsDouble());
                    vcb.attribute("MIN_DP", (int) Percentile.min().evaluate(array).getAsDouble());
                    vcb.attribute("MAX_DP", (int) Percentile.max().evaluate(array).getAsDouble());
                }
                if (filters.isEmpty()) {
                    vcb.passFilters();
                } else {
                    vcb.filters(filters);
                }
                out.add(vcb.make());
            }
        }
        progress.finish();
        peekIter.close();
        out.close();
        out = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
        CloserUtil.close(samIterators);
        CloserUtil.close(samReaders);
        CloserUtil.close(indexedFastaSequenceFile);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) IOUtil(htsjdk.samtools.util.IOUtil) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) CigarOperator(htsjdk.samtools.CigarOperator) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) SortOrder(htsjdk.samtools.SAMFileHeader.SortOrder) Map(java.util.Map) Path(java.nio.file.Path) PeekableIterator(htsjdk.samtools.util.PeekableIterator) CloserUtil(htsjdk.samtools.util.CloserUtil) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Percentile(com.github.lindenb.jvarkit.math.stats.Percentile) SAMRecord(htsjdk.samtools.SAMRecord) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) MergingIterator(com.github.lindenb.jvarkit.util.iterator.MergingIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) Function(java.util.function.Function) ValidationStringency(htsjdk.samtools.ValidationStringency) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) FilterIterator(com.github.lindenb.jvarkit.util.iterator.FilterIterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IntervalTree(htsjdk.samtools.util.IntervalTree) SamReader(htsjdk.samtools.SamReader) File(java.io.File) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) SamRecordJEXLFilter(com.github.lindenb.jvarkit.util.samtools.SamRecordJEXLFilter) QueryInterval(htsjdk.samtools.QueryInterval) SAMRecordCoordinateComparator(htsjdk.samtools.SAMRecordCoordinateComparator) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) CigarOperator(htsjdk.samtools.CigarOperator) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarElement(htsjdk.samtools.CigarElement) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) File(java.io.File) QueryInterval(htsjdk.samtools.QueryInterval) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) SAMRecordCoordinateComparator(htsjdk.samtools.SAMRecordCoordinateComparator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) PeekableIterator(htsjdk.samtools.util.PeekableIterator)

Example 59 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class MakeMiniBam method doWork.

@Override
public int doWork(final List<String> args) {
    ArchiveFactory archive = null;
    int id_generator = 0;
    final Set<String> outputFileNames = new HashSet<>();
    try {
        if (StringUtils.isBlank(this.filePrefix) || this.filePrefix.equals("now")) {
            final SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyyMMdd");
            this.filePrefix = simpleDateFormat.format(new Date()) + ".";
        }
        if (!this.filePrefix.endsWith(".")) {
            this.filePrefix += ".";
        }
        IOUtil.assertDirectoryIsWritable(tmpDir);
        final List<Path> bamFiles = IOUtils.unrollPaths(args);
        if (bamFiles.isEmpty()) {
            LOG.error("no bam file defined");
            return -1;
        }
        final List<Locatable> locatables = this.intervalListProvider.enableBreakEndInterval(!disable_sv_bnd).enableSinglePoint().stream().collect(Collectors.toList());
        if (locatables.isEmpty()) {
            LOG.error("no position defined");
            return -1;
        }
        final SAMFileWriterFactory swf = new SAMFileWriterFactory();
        swf.setCompressionLevel(9);
        swf.setCreateIndex(true);
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.referencePath != null)
            srf.referenceSequence(this.referencePath);
        archive = ArchiveFactory.open(this.outputFile);
        archive.setCompressionLevel(Deflater.NO_COMPRESSION);
        for (final Path bamFile : bamFiles) {
            LOG.info(bamFile.toString());
            final StopWatch stopWatch = new StopWatch();
            stopWatch.start();
            final SamReader sr = srf.open(bamFile);
            if (!sr.hasIndex()) {
                sr.close();
                LOG.error("file " + bamFile + " is not indexed.");
                return -1;
            }
            final SAMFileHeader header = sr.getFileHeader();
            if (!header.getSortOrder().equals(SortOrder.coordinate)) {
                sr.close();
                LOG.error("file " + bamFile + " is not sorted on coordinate.");
                return -1;
            }
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
            final Optional<String> dictLabel = SequenceDictionaryUtils.getBuildName(dict);
            final String labelSuffix = (dictLabel.isPresent() ? "." + dictLabel.get() : "") + (locatables.size() == 1 ? "." + locatables.get(0).getContig() + "_" + locatables.get(0).getStart() + (locatables.get(0).getStart() == locatables.get(0).getEnd() ? "" : "_" + locatables.get(0).getEnd()) : "");
            final ContigNameConverter ctgConvert = ContigNameConverter.fromOneDictionary(dict);
            final QueryInterval[] array = locatables.stream().flatMap(loc -> {
                if (this.bound_edge < 1 || loc.getLengthOnReference() <= this.bound_edge) {
                    return Collections.singletonList(loc).stream();
                }
                return Arrays.asList((Locatable) new SimpleInterval(loc.getContig(), loc.getStart(), loc.getStart()), (Locatable) new SimpleInterval(loc.getContig(), loc.getEnd(), loc.getEnd())).stream();
            }).map(LOC -> {
                final String contig = ctgConvert.apply(LOC.getContig());
                if (StringUtils.isBlank(contig)) {
                    LOG.warn("Cannot find " + LOC.getContig() + " in " + bamFile);
                    return null;
                }
                final SAMSequenceRecord ssr = dict.getSequence(contig);
                if (LOC.getStart() > ssr.getSequenceLength()) {
                    LOG.warn("pos " + LOC + " is greater than chromosome size " + ssr.getSequenceLength() + " in " + bamFile);
                    return null;
                }
                return new QueryInterval(ssr.getSequenceIndex(), Math.max(1, LOC.getStart() - extend), Math.min(ssr.getSequenceLength(), LOC.getEnd() + extend));
            }).filter(Q -> Q != null).collect(HtsCollectors.optimizedQueryIntervals());
            final Path tmpBam = Files.createTempFile(this.tmpDir, "tmp.", ".bam");
            final SAMFileHeader header2 = header.clone();
            final SAMProgramRecord prg = header2.createProgramRecord();
            prg.setProgramName(this.getProgramName());
            prg.setProgramVersion(this.getGitHash());
            prg.setCommandLine(this.getProgramCommandLine());
            JVarkitVersion.getInstance().addMetaData(this, header2);
            header2.addComment("MiniBam : Bam was " + bamFile);
            try (SAMFileWriter sfw = swf.makeBAMWriter(header2, true, tmpBam)) {
                if (array.length > 0) {
                    try (CloseableIterator<SAMRecord> ssr = sr.query(array, false)) {
                        while (ssr.hasNext()) {
                            final SAMRecord rec = ssr.next();
                            if (this.samRecordFilter.filterOut(rec))
                                continue;
                            rec.setAttribute(SAMTag.PG.name(), prg.getId());
                            sfw.addAlignment(rec);
                        }
                    }
                }
            }
            final Path tmpBai = SamFiles.findIndex(tmpBam);
            if (!Files.exists(tmpBai)) {
                LOG.error("Cannot find tmp bai Index for " + bamFile + " " + tmpBam);
                return -1;
            }
            final String sampleName1 = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(null);
            final String sampleName;
            if (!StringUtils.isBlank(sampleName1)) {
                sampleName = sampleName1;
            } else if (this.allow_no_sample) {
                sampleName = IOUtils.getFilenameWithoutSuffix(bamFile, 1);
                LOG.warn("No Read group in " + bamFile + " using filename : " + sampleName);
            } else {
                throw new IllegalArgumentException("No Sample found in " + bamFile + ". Use --no-samples option ?");
            }
            String filename = this.filePrefix + sampleName + labelSuffix;
            while (outputFileNames.contains(filename)) {
                filename = this.filePrefix + sampleName + "." + (id_generator++) + labelSuffix;
            }
            outputFileNames.add(filename);
            archive.copyTo(tmpBam, filename + FileExtensions.BAM);
            archive.copyTo(tmpBai, filename + IOUtils.getFileSuffix(tmpBai));
            stopWatch.stop();
            LOG.info("Added " + StringUtils.niceFileSize(Files.size(tmpBam)) + "(bam) and " + StringUtils.niceFileSize(Files.size(tmpBai)) + " (Bai). " + StringUtils.niceDuration(stopWatch.getElapsedTime()));
            Files.deleteIfExists(tmpBam);
            Files.deleteIfExists(tmpBai);
        }
        if (!StringUtils.isBlank(this.userComment)) {
            try (final PrintWriter pw = archive.openWriter(this.filePrefix + (this.filePrefix.endsWith(".") ? "" : ".") + "README.md")) {
                pw.append(this.userComment);
                pw.println();
                pw.println("## BAMS");
                pw.println();
                for (final Path bamFile : bamFiles) pw.println("  * " + bamFile);
                pw.println();
                pw.println("## Date");
                pw.println();
                pw.println(new SimpleDateFormat("yyyyMMdd").format(new Date()));
                pw.flush();
            }
        }
        archive.close();
        archive = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(archive);
    }
}
Also used : Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) IOUtil(htsjdk.samtools.util.IOUtil) Date(java.util.Date) SamFiles(htsjdk.samtools.SamFiles) IntervalListProvider(com.github.lindenb.jvarkit.samtools.util.IntervalListProvider) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SortOrder(htsjdk.samtools.SAMFileHeader.SortOrder) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) SAMFileWriter(htsjdk.samtools.SAMFileWriter) Deflater(java.util.zip.Deflater) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SAMProgramRecord(htsjdk.samtools.SAMProgramRecord) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) FileExtensions(htsjdk.samtools.util.FileExtensions) Optional(java.util.Optional) SamReaderFactory(htsjdk.samtools.SamReaderFactory) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) SimpleDateFormat(java.text.SimpleDateFormat) HtsCollectors(com.github.lindenb.jvarkit.stream.HtsCollectors) HashSet(java.util.HashSet) SAMTag(htsjdk.samtools.SAMTag) StopWatch(htsjdk.samtools.util.StopWatch) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Locatable(htsjdk.samtools.util.Locatable) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) Files(java.nio.file.Files) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) QueryInterval(htsjdk.samtools.QueryInterval) SamRecordFilterFactory(com.github.lindenb.jvarkit.util.bio.samfilter.SamRecordFilterFactory) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) QueryInterval(htsjdk.samtools.QueryInterval) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SAMProgramRecord(htsjdk.samtools.SAMProgramRecord) SamReader(htsjdk.samtools.SamReader) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) PrintWriter(java.io.PrintWriter) Path(java.nio.file.Path) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) Date(java.util.Date) StopWatch(htsjdk.samtools.util.StopWatch) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SimpleDateFormat(java.text.SimpleDateFormat) Locatable(htsjdk.samtools.util.Locatable)

Example 60 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class HmmMergeBed method doWork.

@Override
public int doWork(final List<String> args) {
    PrintWriter pw = null;
    try {
        if (this.numTreshold < 1) {
            LOG.error("bad treshold.");
            return -1;
        }
        final Comparator<Base> locCompare;
        if (this.dictPath != null) {
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.dictPath);
            locCompare = new ContigDictComparator(dict).createLocatableComparator();
        } else {
            locCompare = (A, B) -> {
                int i = A.getContig().compareTo(B.getContig());
                if (i != 0)
                    return i;
                i = Integer.compare(A.getStart(), B.getStart());
                if (i != 0)
                    return i;
                return Integer.compare(A.getEnd(), B.getEnd());
            };
        }
        final List<Path> paths = IOUtils.unrollPaths(args);
        if (this.numTreshold > paths.size()) {
            LOG.error("bad treshold (> number of files).");
            return -1;
        }
        if (paths.isEmpty()) {
            LOG.error("empty input");
            return -1;
        }
        final List<CloseableIterator<Base>> baseiterators = new ArrayList<>(paths.size());
        for (final Path bedPath : paths) {
            final BedLineIterator iter1 = new BedLineIterator(bedPath);
            final BedLineToBaseIterator iter2 = new BedLineToBaseIterator(iter1);
            baseiterators.add(iter2);
        }
        pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
        final BaseMergerIterator bmiter = new BaseMergerIterator(baseiterators, locCompare);
        while (bmiter.hasNext()) {
            final BaseMerge bm = bmiter.next();
            if (!bm.isValid()) {
                if (this.hide_invalid)
                    continue;
                pw.print("#");
            }
            pw.print(bm.contig);
            pw.print('\t');
            pw.print(bm.start - 1);
            pw.print('\t');
            pw.print(bm.end);
            pw.print('\t');
            pw.print(bm.getOpcode());
            pw.println();
        }
        bmiter.close();
        pw.flush();
        pw.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    }
}
Also used : Path(java.nio.file.Path) CloseableIterator(htsjdk.samtools.util.CloseableIterator) AbstractCloseableIterator(com.github.lindenb.jvarkit.iterator.AbstractCloseableIterator) ArrayList(java.util.ArrayList) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) PrintWriter(java.io.PrintWriter)

Aggregations

CloseableIterator (htsjdk.samtools.util.CloseableIterator)103 List (java.util.List)86 Logger (com.github.lindenb.jvarkit.util.log.Logger)85 Parameter (com.beust.jcommander.Parameter)82 Program (com.github.lindenb.jvarkit.util.jcommander.Program)78 ArrayList (java.util.ArrayList)73 Collectors (java.util.stream.Collectors)71 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)69 Path (java.nio.file.Path)69 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)66 CloserUtil (htsjdk.samtools.util.CloserUtil)64 Set (java.util.Set)64 VCFHeader (htsjdk.variant.vcf.VCFHeader)59 VariantContext (htsjdk.variant.variantcontext.VariantContext)54 IOException (java.io.IOException)53 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)51 SAMRecord (htsjdk.samtools.SAMRecord)51 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)51 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)50 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)49