Search in sources :

Example 81 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class GroupByGene method read.

private void read(final String input) throws IOException {
    SortingCollection<Call> sortingCollection = null;
    VCFIterator vcfIterator = null;
    try {
        final BcfIteratorBuilder iterbuilder = new BcfIteratorBuilder();
        vcfIterator = (input == null ? iterbuilder.open(stdin()) : iterbuilder.open(input));
        final VCFHeader header = vcfIterator.getHeader();
        this.contigDictComparator = new ContigDictComparator(SequenceDictionaryUtils.extractRequired(header));
        sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(header), (C1, C2) -> C1.compare2(C2), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingCollection.setDestructiveIteration(true);
        final GeneExtractorFactory geneExtractorFactory = new GeneExtractorFactory(header);
        final List<GeneExtractorFactory.GeneExtractor> geneExtractors = geneExtractorFactory.parse(this.extractorsNames);
        final List<String> sampleNames;
        if (header.getSampleNamesInOrder() != null) {
            sampleNames = header.getSampleNamesInOrder();
        } else {
            sampleNames = Collections.emptyList();
        }
        final Pedigree pedigree;
        if (this.pedigreePath != null) {
            final PedigreeParser pedParser = new PedigreeParser();
            pedigree = pedParser.parse(this.pedigreePath);
        } else {
            pedigree = PedigreeParser.empty();
        }
        final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(header).logger(LOG).build();
        while (vcfIterator.hasNext()) {
            final VariantContext ctx = progress.apply(vcfIterator.next());
            if (!ctx.isVariant())
                continue;
            if (ignore_filtered && ctx.isFiltered())
                continue;
            // simplify line
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            vcb.noID();
            vcb.log10PError(VariantContext.NO_LOG10_PERROR);
            vcb.unfiltered();
            vcb.attributes(Collections.emptyMap());
            final VariantContext ctx2 = vcb.make();
            final SortingCollection<Call> finalSorter = sortingCollection;
            geneExtractors.stream().flatMap(EX -> EX.apply(ctx).keySet().stream()).forEach(KG -> {
                final Call c = new Call();
                c.ctx = ctx2;
                c.gene = new GeneName(KG.getKey(), KG.getGene(), KG.getMethod());
                finalSorter.add(c);
            });
        }
        CloserUtil.close(vcfIterator);
        vcfIterator = null;
        sortingCollection.doneAdding();
        progress.close();
        /**
         * dump
         */
        final Set<String> casesSamples = pedigree.getAffectedSamples().stream().map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> controlsSamples = pedigree.getUnaffectedSamples().stream().map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> maleSamples = pedigree.getSamples().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> femaleSamples = pedigree.getSamples().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Predicate<Genotype> genotypeFilter = genotype -> {
            if (!genotype.isAvailable())
                return false;
            if (!genotype.isCalled())
                return false;
            if (genotype.isNoCall())
                return false;
            if (genotype.isHomRef())
                return false;
            if (this.ignore_filtered_genotype && genotype.isFiltered())
                return false;
            return true;
        };
        PrintStream pw = openPathOrStdoutAsPrintStream(this.outFile);
        pw.print("#chrom");
        pw.print('\t');
        pw.print("min.POS");
        pw.print('\t');
        pw.print("max.POS");
        pw.print('\t');
        pw.print("gene.name");
        pw.print('\t');
        pw.print("gene.label");
        pw.print('\t');
        pw.print("gene.type");
        pw.print('\t');
        pw.print("samples.affected");
        pw.print('\t');
        pw.print("count.variations");
        if (this.print_positions) {
            pw.print('\t');
            pw.print("positions");
        }
        if (!casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.cases");
        }
        if (!controlsSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.controls");
        }
        if (!maleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.males");
        }
        if (!femaleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.females");
        }
        if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("fisher");
        }
        for (final String sample : sampleNames) {
            pw.print('\t');
            pw.print(sample);
        }
        pw.println();
        final CloseableIterator<Call> iter = sortingCollection.iterator();
        final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
        while (eqiter.hasNext()) {
            final List<Call> row = eqiter.next();
            final Call first = row.get(0);
            final List<VariantContext> variantList = row.stream().map(R -> R.ctx).collect(Collectors.toList());
            final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
            final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
            final Set<String> sampleCarryingMut = new HashSet<String>();
            final Counter<String> pedCasesCarryingMut = new Counter<String>();
            final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
            final Counter<String> malesCarryingMut = new Counter<String>();
            final Counter<String> femalesCarryingMut = new Counter<String>();
            final Counter<String> sample2count = new Counter<String>();
            for (final VariantContext ctx : variantList) {
                for (final Genotype genotype : ctx.getGenotypes()) {
                    if (!genotypeFilter.test(genotype))
                        continue;
                    final String sampleName = genotype.getSampleName();
                    sample2count.incr(sampleName);
                    sampleCarryingMut.add(sampleName);
                    if (casesSamples.contains(sampleName)) {
                        pedCasesCarryingMut.incr(sampleName);
                    }
                    if (controlsSamples.contains(sampleName)) {
                        pedCtrlsCarryingMut.incr(sampleName);
                    }
                    if (maleSamples.contains(sampleName)) {
                        malesCarryingMut.incr(sampleName);
                    }
                    if (femaleSamples.contains(sampleName)) {
                        femalesCarryingMut.incr(sampleName);
                    }
                }
            }
            pw.print(first.getContig());
            pw.print('\t');
            // convert to bed
            pw.print(minPos - 1);
            pw.print('\t');
            pw.print(maxPos);
            pw.print('\t');
            pw.print(first.gene.name);
            pw.print('\t');
            pw.print(first.gene.label);
            pw.print('\t');
            pw.print(first.gene.type);
            pw.print('\t');
            pw.print(sampleCarryingMut.size());
            pw.print('\t');
            pw.print(variantList.size());
            if (this.print_positions) {
                pw.print('\t');
                pw.print(variantList.stream().map(CTX -> String.valueOf(CTX.getStart()) + ":" + CTX.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining("/"))).collect(Collectors.joining(";")));
            }
            if (!casesSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCasesCarryingMut.getCountCategories());
            }
            if (!controlsSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCtrlsCarryingMut.getCountCategories());
            }
            if (!maleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(malesCarryingMut.getCountCategories());
            }
            if (!femaleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(femalesCarryingMut.getCountCategories());
            }
            if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
                int count_case_mut = 0;
                int count_ctrl_mut = 0;
                int count_case_wild = 0;
                int count_ctrl_wild = 0;
                for (final String sampleName : header.getSampleNamesInOrder()) {
                    final boolean has_mutation = variantList.stream().map(V -> V.getGenotype(sampleName)).anyMatch(G -> G != null && genotypeFilter.test(G));
                    if (controlsSamples.contains(sampleName)) {
                        if (has_mutation) {
                            count_ctrl_mut++;
                        } else {
                            count_ctrl_wild++;
                        }
                    } else if (casesSamples.contains(sampleName)) {
                        if (has_mutation) {
                            count_case_mut++;
                        } else {
                            count_case_wild++;
                        }
                    }
                }
                final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
                pw.print('\t');
                pw.print(fisher.getAsDouble());
            }
            for (final String sample : sampleNames) {
                pw.print('\t');
                pw.print(sample2count.count(sample));
            }
            pw.println();
            if (pw.checkError())
                break;
        }
        eqiter.close();
        iter.close();
        pw.flush();
        if (this.outFile != null)
            pw.close();
    } finally {
        CloserUtil.close(vcfIterator);
        if (sortingCollection != null)
            sortingCollection.cleanup();
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) VCFIterator(htsjdk.variant.vcf.VCFIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) GeneExtractorFactory(com.github.lindenb.jvarkit.util.vcf.predictions.GeneExtractorFactory) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFEncoder(htsjdk.variant.vcf.VCFEncoder) VCFHeaderVersion(htsjdk.variant.vcf.VCFHeaderVersion) ParametersDelegate(com.beust.jcommander.ParametersDelegate) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) DataOutputStream(java.io.DataOutputStream) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) PedigreeParser(com.github.lindenb.jvarkit.pedigree.PedigreeParser) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFCodec(htsjdk.variant.vcf.VCFCodec) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintStream(java.io.PrintStream) SortingCollection(htsjdk.samtools.util.SortingCollection) Counter(com.github.lindenb.jvarkit.util.Counter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Pedigree(com.github.lindenb.jvarkit.pedigree.Pedigree) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) Set(java.util.Set) IOException(java.io.IOException) EOFException(java.io.EOFException) Collectors(java.util.stream.Collectors) List(java.util.List) BcfIteratorBuilder(com.github.lindenb.jvarkit.variant.vcf.BcfIteratorBuilder) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContext(htsjdk.variant.variantcontext.VariantContext) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VariantContext(htsjdk.variant.variantcontext.VariantContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) BcfIteratorBuilder(com.github.lindenb.jvarkit.variant.vcf.BcfIteratorBuilder) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) PedigreeParser(com.github.lindenb.jvarkit.pedigree.PedigreeParser) Counter(com.github.lindenb.jvarkit.util.Counter) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFIterator(htsjdk.variant.vcf.VCFIterator) HashSet(java.util.HashSet) PrintStream(java.io.PrintStream) Genotype(htsjdk.variant.variantcontext.Genotype) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) GeneExtractorFactory(com.github.lindenb.jvarkit.util.vcf.predictions.GeneExtractorFactory) Pedigree(com.github.lindenb.jvarkit.pedigree.Pedigree) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Example 82 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class ExpansionHunterMerge method doWork.

@Override
public int doWork(List<String> args) {
    SortingCollection<VariantContext> sorter = null;
    try {
        final List<Path> inputs = IOUtils.unrollPaths(args);
        if (inputs.isEmpty()) {
            LOG.info("no input file.");
            return -1;
        }
        final String REPID = "REPID";
        final String RU = "RU";
        final String fakeSample = "___FAKE";
        SAMSequenceDictionary dict = null;
        final Set<String> samples = new TreeSet<>();
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        for (final Path path : inputs) {
            try (VCFReader r = VCFReaderFactory.makeDefault().open(path, false)) {
                final VCFHeader header = r.getHeader();
                if (header.getInfoHeaderLine(REPID) == null) {
                    LOG.error("missing INFO/" + REPID);
                    return -1;
                }
                final SAMSequenceDictionary d = header.getSequenceDictionary();
                if (d != null) {
                    if (dict == null) {
                        dict = d;
                    } else {
                        SequenceUtil.assertSequenceDictionariesEqual(d, dict);
                    }
                }
                if (header.getNGenotypeSamples() != 1) {
                    LOG.error("expected one and only one genotyped sample in " + path);
                    return -1;
                }
                final String sn = header.getSampleNamesInOrder().get(0);
                if (sn.equals(fakeSample)) {
                    LOG.error(sn + " cannot be named " + fakeSample + " in " + path);
                    return -1;
                }
                if (samples.contains(sn)) {
                    LOG.error("duplicate sample " + sn + " in " + path);
                    return -1;
                }
                metaData.addAll(header.getMetaDataInInputOrder());
                samples.add(sn);
            }
        }
        final VCFHeader tmpHeader = new VCFHeader(metaData, Arrays.asList(fakeSample));
        final VCFInfoHeaderLine sampleInfo = new VCFInfoHeaderLine("SRCSAMPLE", 1, VCFHeaderLineType.String, "SRC SAMPLE");
        final Comparator<String> ctgComparator = dict == null ? (A, B) -> A.compareTo(B) : new ContigDictComparator(dict);
        final BiFunction<VariantContext, String, String> getAtt = (V, A) -> {
            final String s1 = V.getAttributeAsString(A, "");
            if (StringUtils.isBlank(s1))
                throw new IllegalStateException("INFO/" + A + " missing in " + V);
            return s1;
        };
        final Comparator<VariantContext> comparator1 = (V1, V2) -> {
            String s1 = V1.getContig();
            String s2 = V2.getContig();
            int i = ctgComparator.compare(s1, s2);
            if (i != 0)
                return i;
            i = Integer.compare(V1.getStart(), V2.getStart());
            if (i != 0)
                return i;
            s1 = getAtt.apply(V1, REPID);
            s2 = getAtt.apply(V2, REPID);
            return s1.compareTo(s2);
        };
        final Comparator<VariantContext> comparator2 = (V1, V2) -> {
            final int i = comparator1.compare(V1, V2);
            if (i != 0)
                return i;
            final String s1 = getAtt.apply(V1, sampleInfo.getID());
            final String s2 = getAtt.apply(V2, sampleInfo.getID());
            return s1.compareTo(s2);
        };
        tmpHeader.addMetaDataLine(sampleInfo);
        sorter = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(tmpHeader, false), comparator2, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sorter.setDestructiveIteration(true);
        for (final Path path : inputs) {
            LOG.info("adding " + path);
            try (VCFReader r = VCFReaderFactory.makeDefault().open(path, false)) {
                try (CloseableIterator<VariantContext> iter = r.iterator()) {
                    while (iter.hasNext()) {
                        final VariantContext ctx = iter.next();
                        final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                        final Genotype gt0 = ctx.getGenotype(0);
                        final Genotype gt = new GenotypeBuilder(gt0).name(fakeSample).make();
                        vcb.genotypes(Collections.singletonList(gt));
                        vcb.attribute(sampleInfo.getID(), gt0.getSampleName());
                        sorter.add(vcb.make());
                    }
                }
            }
        }
        sorter.doneAdding();
        final VCFHeader outputHeader = new VCFHeader(metaData, samples);
        if (dict != null)
            outputHeader.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, outputHeader);
        try (VariantContextWriter w = writingVariants.dictionary(dict).open(this.outputFile)) {
            w.writeHeader(outputHeader);
            try (CloseableIterator<VariantContext> iter = sorter.iterator()) {
                final EqualRangeIterator<VariantContext> eq = new EqualRangeIterator<>(iter, comparator1);
                while (eq.hasNext()) {
                    final List<VariantContext> calls = eq.next();
                    final VariantContext first = calls.get(0);
                    final Set<Allele> altAllelesSet = calls.stream().flatMap(V -> V.getGenotypes().stream()).flatMap(GT -> GT.getAlleles().stream()).filter(A -> !(A.isReference() || A.isNoCall())).collect(Collectors.toSet());
                    if (altAllelesSet.isEmpty())
                        continue;
                    final List<Allele> altAllelesList = new ArrayList<>(altAllelesSet);
                    final List<Allele> vcAlleles = new ArrayList<>(altAllelesList.size() + 1);
                    vcAlleles.add(first.getReference());
                    vcAlleles.addAll(altAllelesList);
                    final List<Genotype> genotypes = new ArrayList<>(samples.size());
                    for (final String sn : samples) {
                        final VariantContext vcs = calls.stream().filter(V -> getAtt.apply(V, sampleInfo.getID()).equals(sn)).findFirst().orElse(null);
                        final Genotype gt;
                        if (vcs == null) {
                            gt = GenotypeBuilder.createMissing(sn, 2);
                        } else {
                            gt = new GenotypeBuilder(vcs.getGenotype(0)).name(sn).make();
                        }
                        genotypes.add(gt);
                    }
                    final VariantContextBuilder vcb = new VariantContextBuilder(null, first.getContig(), first.getStart(), first.getEnd(), vcAlleles);
                    vcb.attribute(VCFConstants.END_KEY, first.getEnd());
                    vcb.attribute(REPID, getAtt.apply(first, REPID));
                    vcb.attribute(RU, getAtt.apply(first, RU));
                    vcb.genotypes(genotypes);
                    w.add(vcb.make());
                }
                eq.close();
            }
        }
        sorter.cleanup();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) BiFunction(java.util.function.BiFunction) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) VCFConstants(htsjdk.variant.vcf.VCFConstants) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) Comparator(java.util.Comparator) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) TreeSet(java.util.TreeSet) VCFReader(htsjdk.variant.vcf.VCFReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Path(java.nio.file.Path) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Example 83 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class DepthOfCoverage method doWork.

@Override
public int doWork(final List<String> args) {
    PrintWriter out = null;
    if (this.auto_mask && this.faidx == null) {
        LOG.error("Cannot auto mask if REF is not defined");
        return -1;
    }
    if (this.maskBed != null && this.includeBed != null) {
        LOG.error("both --mask and --bed both defined");
        return -1;
    }
    ReferenceSequenceFile referenceSequenceFile = null;
    try {
        final Predicate<String> isRejectContigRegex;
        if (!StringUtils.isBlank(this.skipContigExpr)) {
            final Pattern pat = Pattern.compile(this.skipContigExpr);
            isRejectContigRegex = S -> pat.matcher(S).matches();
        } else {
            isRejectContigRegex = S -> false;
        }
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null) {
            srf.referenceSequence(this.faidx);
            srf.setUseAsyncIo(this.asyncIo);
            referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        }
        out = super.openPathOrStdoutAsPrintWriter(this.outputFile);
        out.print("#BAM\tSample\tContig\tContig-Length\tMasked-Contig-Length\tCount\tDepth\tMedian\tMin\tMax\tMaxPos");
        for (RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
            if (r.getMinInclusive() == null)
                continue;
            out.print("\t");
            out.print(r.toString());
        }
        out.println();
        for (final Path path : IOUtils.unrollPaths(args)) {
            try (final SamReader sr = srf.open(path)) {
                if (!sr.hasIndex()) {
                    LOG.error("File " + path + " is not indexed.");
                    return -1;
                }
                final SAMFileHeader header = sr.getFileHeader();
                final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
                final Set<String> rejectContigSet = dict.getSequences().stream().map(SSR -> SSR.getSequenceName()).filter(isRejectContigRegex).collect(Collectors.toCollection(HashSet::new));
                rejectContigSet.addAll(dict.getSequences().stream().filter(SSR -> SSR.getSequenceLength() < this.skipContigLength).map(SSR -> SSR.getSequenceName()).collect(Collectors.toCollection(HashSet::new)));
                if (!header.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
                    LOG.error("file is not sorted on coordinate :" + header.getSortOrder() + " " + path);
                    return -1;
                }
                final QueryInterval[] intervals;
                if (this.useBamIndexFlag && this.includeBed != null) {
                    if (!sr.hasIndex()) {
                        LOG.error("Bam is not indexed. " + path);
                        return -1;
                    }
                    final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
                    final List<QueryInterval> L = new ArrayList<>();
                    try (BedLineReader br = new BedLineReader(this.includeBed)) {
                        while (br.hasNext()) {
                            final BedLine bed = br.next();
                            final String ctg = contigNameConverter.apply(bed.getContig());
                            if (StringUtils.isBlank(ctg))
                                continue;
                            final int tid = dict.getSequenceIndex(ctg);
                            if (tid < 0)
                                continue;
                            L.add(new QueryInterval(tid, bed.getStart(), bed.getEnd()));
                        }
                    }
                    intervals = QueryInterval.optimizeIntervals(L.toArray(new QueryInterval[L.size()]));
                } else {
                    intervals = null;
                }
                Integer minCov = null;
                Integer maxCov = null;
                ContigPos maxCovPosition = null;
                long count_raw_bases = 0L;
                long count_bases = 0L;
                long sum_coverage = 0L;
                final DiscreteMedian<Integer> discreteMedian_wg = new DiscreteMedian<>();
                final Counter<RangeOfIntegers.Range> countMap_wg = new Counter<>();
                final String sample = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(path.toString());
                int[] coverage = null;
                String prevContig = null;
                BitSet mask = null;
                final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
                try (CloseableIterator<SAMRecord> iter = intervals == null ? sr.iterator() : sr.queryOverlapping(intervals)) {
                    for (; ; ) {
                        final SAMRecord rec = iter.hasNext() ? progress.apply(iter.next()) : null;
                        if (rec != null) {
                            if (!SAMRecordDefaultFilter.accept(rec, this.mapping_quality))
                                continue;
                            if (rejectContigSet.contains(rec.getContig()))
                                continue;
                        }
                        if (rec == null || !rec.getContig().equals(prevContig)) {
                            if (coverage != null) {
                                // DUMP
                                long count_bases_ctg = 0L;
                                long sum_coverage_ctg = 0L;
                                Integer minV_ctg = null;
                                Integer maxV_ctg = null;
                                ContigPos maxPos_ctg = null;
                                final DiscreteMedian<Integer> discreteMedian_ctg = new DiscreteMedian<>();
                                final Counter<RangeOfIntegers.Range> countMap_ctg = new Counter<>();
                                for (int i = 0; i < coverage.length; i++) {
                                    if (mask.get(i))
                                        continue;
                                    final int covi = coverage[i];
                                    if (covi > this.max_depth)
                                        continue;
                                    if (minV_ctg == null || minV_ctg.intValue() > covi)
                                        minV_ctg = covi;
                                    if (maxV_ctg == null || maxV_ctg.intValue() < covi) {
                                        maxV_ctg = covi;
                                        maxPos_ctg = new ContigPos(prevContig, i + 1);
                                    }
                                    countMap_ctg.incr(this.summaryCov.getRange(covi));
                                    count_bases_ctg++;
                                    sum_coverage_ctg += covi;
                                    discreteMedian_ctg.add(covi);
                                }
                                out.print(path);
                                out.print("\t");
                                out.print(sample);
                                out.print("\t");
                                out.print(prevContig);
                                out.print("\t");
                                out.print(coverage.length);
                                out.print("\t");
                                out.print(count_bases_ctg);
                                out.print("\t");
                                out.print(sum_coverage_ctg);
                                out.print("\t");
                                if (count_bases_ctg > 0) {
                                    out.printf("%.2f", sum_coverage_ctg / (double) count_bases_ctg);
                                } else {
                                    out.print("N/A");
                                }
                                out.print("\t");
                                final OptionalDouble median = discreteMedian_ctg.getMedian();
                                if (median.isPresent()) {
                                    out.print(median.getAsDouble());
                                } else {
                                    out.print("N/A");
                                }
                                out.print("\t");
                                if (minV_ctg != null) {
                                    out.print(minV_ctg);
                                } else {
                                    out.print("N/A");
                                }
                                out.print("\t");
                                if (maxV_ctg != null) {
                                    out.print(maxV_ctg);
                                    out.print("\t");
                                    out.print(maxPos_ctg);
                                } else {
                                    out.print("N/A\tN/A");
                                }
                                for (final RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
                                    if (r.getMinInclusive() == null)
                                        continue;
                                    out.print("\t");
                                    out.print(countMap_ctg.count(r));
                                    if (!countMap_ctg.isEmpty()) {
                                        out.print(" ");
                                        out.printf("(%.2f%%)", (countMap_ctg.count(r) / (countMap_ctg.getTotal() * 1.0)) * 100.0);
                                    }
                                }
                                out.println();
                                if (minCov == null || (minV_ctg != null && minV_ctg.compareTo(minCov) < 0))
                                    minCov = minV_ctg;
                                if (maxCov == null || (maxV_ctg != null && maxV_ctg.compareTo(maxCov) > 0)) {
                                    maxCov = maxV_ctg;
                                    maxCovPosition = maxPos_ctg;
                                }
                                count_bases += count_bases_ctg;
                                sum_coverage += sum_coverage_ctg;
                                count_raw_bases += coverage.length;
                                discreteMedian_wg.add(discreteMedian_ctg);
                                countMap_wg.putAll(countMap_ctg);
                            }
                            coverage = null;
                            mask = null;
                            // /
                            System.gc();
                            if (rec == null)
                                break;
                            final SAMSequenceRecord ssr = Objects.requireNonNull(dict.getSequence(rec.getContig()));
                            coverage = new int[ssr.getSequenceLength()];
                            mask = new BitSet(ssr.getSequenceLength());
                            if (this.auto_mask && referenceSequenceFile != null) {
                                final byte[] refSeq = Objects.requireNonNull(referenceSequenceFile.getSequence(ssr.getSequenceName())).getBases();
                                for (int i = 0; i < refSeq.length; i++) {
                                    if (AcidNucleics.isATGC(refSeq[i]))
                                        continue;
                                    mask.set(i);
                                }
                            }
                            /* read mask */
                            if (this.maskBed != null) {
                                final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
                                try (BedLineReader br = new BedLineReader(this.maskBed)) {
                                    while (br.hasNext()) {
                                        final BedLine bed = br.next();
                                        if (bed == null)
                                            continue;
                                        String ctg = contigNameConverter.apply(bed.getContig());
                                        if (StringUtils.isBlank(ctg))
                                            continue;
                                        if (!rec.getContig().equals(ctg))
                                            continue;
                                        for (int p1 = bed.getStart(); p1 <= bed.getEnd() && p1 <= coverage.length; ++p1) {
                                            mask.set(p1 - 1);
                                        }
                                    }
                                }
                            } else if (this.includeBed != null) {
                                final List<Locatable> list = new ArrayList<>();
                                final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
                                try (BedLineReader br = new BedLineReader(this.includeBed)) {
                                    while (br.hasNext()) {
                                        final BedLine bed = br.next();
                                        if (bed == null)
                                            continue;
                                        final String ctg = contigNameConverter.apply(bed.getContig());
                                        if (StringUtils.isBlank(ctg))
                                            continue;
                                        if (!rec.getContig().equals(ctg))
                                            continue;
                                        list.add(new SimpleInterval(ctg, bed.getStart(), bed.getEnd()));
                                    }
                                }
                                // sort on starts
                                Collections.sort(list, (A, B) -> Integer.compare(A.getStart(), B.getStart()));
                                int p1 = 1;
                                while (p1 <= coverage.length) {
                                    while (!list.isEmpty() && list.get(0).getEnd() < p1) {
                                        list.remove(0);
                                    }
                                    if (!list.isEmpty() && list.get(0).getStart() <= p1 && p1 <= list.get(0).getEnd()) {
                                        ++p1;
                                        continue;
                                    }
                                    mask.set(p1 - 1);
                                    p1++;
                                }
                            }
                            prevContig = rec.getContig();
                        }
                        int max_end1 = coverage.length;
                        if (!this.disable_paired_overlap_flag && rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex()) && rec.getAlignmentStart() < rec.getMateAlignmentStart() && rec.getAlignmentEnd() > rec.getMateAlignmentStart()) {
                            max_end1 = rec.getMateAlignmentStart() - 1;
                        }
                        for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
                            final int pos1 = block.getReferenceStart();
                            final int len = block.getLength();
                            for (int i = 0; i < len; i++) {
                                if (pos1 + i - 1 >= 0 && pos1 + i <= max_end1) {
                                    coverage[pos1 + i - 1]++;
                                }
                            }
                        }
                    }
                /* end rec */
                }
                /* end iter */
                progress.close();
                out.print(path);
                out.print("\t");
                out.print(sample);
                out.print("\t");
                out.print(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
                out.print("\t");
                out.print(count_raw_bases);
                out.print("\t");
                out.print(count_bases);
                out.print("\t");
                out.print(sum_coverage);
                out.print("\t");
                if (count_bases > 0) {
                    out.printf("%.2f", sum_coverage / (double) count_bases);
                } else {
                    out.print("N/A");
                }
                out.print("\t");
                final OptionalDouble median = discreteMedian_wg.getMedian();
                if (median.isPresent()) {
                    out.print(median.getAsDouble());
                } else {
                    out.print("N/A");
                }
                out.print("\t");
                if (minCov != null) {
                    out.print(minCov);
                } else {
                    out.print("N/A");
                }
                out.print("\t");
                if (maxCov != null) {
                    out.print(maxCov + "\t" + maxCovPosition);
                } else {
                    out.print("N/A\tN/A");
                }
                for (final RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
                    if (r.getMinInclusive() == null)
                        continue;
                    out.print("\t");
                    out.print(countMap_wg.count(r));
                    if (!countMap_wg.isEmpty()) {
                        out.print(" ");
                        out.printf("(%.2f%%)", (countMap_wg.count(r) / (countMap_wg.getTotal() * 1.0)) * 100.0);
                    }
                }
                out.println();
            }
        }
        out.flush();
        out.close();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(referenceSequenceFile);
    }
}
Also used : CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) OptionalDouble(java.util.OptionalDouble) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ArrayList(java.util.ArrayList) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) HashSet(java.util.HashSet) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) AlignmentBlock(htsjdk.samtools.AlignmentBlock) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) SAMRecordDefaultFilter(com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) DiscreteMedian(com.github.lindenb.jvarkit.math.DiscreteMedian) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Counter(com.github.lindenb.jvarkit.util.Counter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) Set(java.util.Set) SamReader(htsjdk.samtools.SamReader) Collectors(java.util.stream.Collectors) ContigPos(com.github.lindenb.jvarkit.util.vcf.ContigPos) SAMRecord(htsjdk.samtools.SAMRecord) Objects(java.util.Objects) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) QueryInterval(htsjdk.samtools.QueryInterval) RangeOfIntegers(com.github.lindenb.jvarkit.math.RangeOfIntegers) BitSet(java.util.BitSet) Pattern(java.util.regex.Pattern) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SamReaderFactory(htsjdk.samtools.SamReaderFactory) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) QueryInterval(htsjdk.samtools.QueryInterval) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) DiscreteMedian(com.github.lindenb.jvarkit.math.DiscreteMedian) ArrayList(java.util.ArrayList) List(java.util.List) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) PrintWriter(java.io.PrintWriter) HashSet(java.util.HashSet) Path(java.nio.file.Path) Pattern(java.util.regex.Pattern) SamReaderFactory(htsjdk.samtools.SamReaderFactory) ContigPos(com.github.lindenb.jvarkit.util.vcf.ContigPos) BitSet(java.util.BitSet) OptionalDouble(java.util.OptionalDouble) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SAMRecord(htsjdk.samtools.SAMRecord) RangeOfIntegers(com.github.lindenb.jvarkit.math.RangeOfIntegers) SAMFileHeader(htsjdk.samtools.SAMFileHeader) AlignmentBlock(htsjdk.samtools.AlignmentBlock)

Example 84 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class BuildDbsnp method doWork.

@Override
public int doWork(final List<String> args) {
    try {
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.faidx);
        if (!StringUtils.isBlank(this.limitChrom)) {
            if (dict.getSequence(this.limitChrom) == null) {
                throw new JvarkitException.ContigNotFoundInDictionary(this.limitChrom, dict);
            }
        }
        final List<VCFSource> sources = new ArrayList<>(args.size());
        for (int i = 0; i < args.size(); i++) {
            final String s = args.get(i);
            final int colon = s.indexOf(':');
            if (colon <= 0) {
                LOG.error("colon missing in " + s + " expected vcf-name:vcf-path.");
                return -1;
            }
            final String vcfName = s.substring(0, colon);
            if (sources.stream().anyMatch(S -> S.name.equals(vcfName))) {
                LOG.error("duplicate source name: \"" + vcfName + "\".");
                return -1;
            }
            final Path path = Paths.get(s.substring(colon + 1));
            IOUtil.assertFileIsReadable(path);
            sources.add(new VCFSource(vcfName, path));
        }
        final Pattern rsIdPattern = Pattern.compile("[rR][sS][0-9]+");
        try (VariantContextWriter w = writingVariantsDelegate.dictionary(dict).open(this.outputFile)) {
            final VCFHeader header = new VCFHeader();
            header.setSequenceDictionary(dict);
            JVarkitVersion.getInstance().addMetaData(this, header);
            w.writeHeader(header);
            for (final SAMSequenceRecord ssr : dict.getSequences()) {
                if (!StringUtils.isBlank(this.limitChrom) && !ssr.getContig().equals(this.limitChrom))
                    continue;
                LOG.info(ssr.getContig());
                sources.stream().forEach(SRC -> SRC.reset(ssr));
                final List<CloseableIterator<Variant>> iterators = sources.stream().map(SRC -> SRC).collect(Collectors.toList());
                final MergingIterator<Variant> merger = new MergingIterator<>((A, B) -> Integer.compare(A.pos, B.pos), iterators);
                final EqualRangeIterator<Variant> equal_range = new EqualRangeIterator<>(merger, (A, B) -> Integer.compare(A.pos, B.pos));
                while (equal_range.hasNext()) {
                    final List<Variant> variants0 = equal_range.next();
                    // all references in this set of variant
                    final Set<Allele> all_refs = variants0.stream().map(V -> V.alleles.get(0)).collect(Collectors.toCollection(TreeSet::new));
                    // loop over each ref
                    for (final Allele ref_allele : all_refs) {
                        // all variants with this ref allele
                        final List<Variant> variants = variants0.stream().filter(V -> V.alleles.get(0).equals(ref_allele)).collect(Collectors.toList());
                        final Variant variant = variants.get(0);
                        final Set<Allele> altSet = new HashSet<>();
                        variants.stream().forEach(V -> altSet.addAll(V.alleles.subList(1, V.alleles.size())));
                        altSet.remove(Allele.SPAN_DEL);
                        final List<Allele> alleles = new ArrayList<>(1 + altSet.size());
                        alleles.add(ref_allele);
                        alleles.addAll(altSet);
                        final VariantContextBuilder vcb = new VariantContextBuilder(null, ssr.getContig(), variant.pos, variant.pos + variant.alleles.get(0).length() - 1, alleles);
                        String id = variants.stream().filter(F -> // vcf spec:  Semicolon-separated list of unique identifiers where available
                        Arrays.stream(CharSplitter.SEMICOLON.split(F.id)).anyMatch(S -> rsIdPattern.matcher(S).matches())).map(F -> F.id).findFirst().orElse(null);
                        // not an rs ID
                        if (StringUtils.isBlank(id))
                            id = variants.stream().map(F -> F.id + (this.add_filter_suffix && F.filtered ? "_F" : "")).findFirst().orElse(null);
                        vcb.id(variant.id);
                        w.add(vcb.make());
                    }
                }
                equal_range.close();
                merger.close();
                iterators.stream().forEach(R -> R.close());
            }
        }
        sources.stream().forEach(SRC -> SRC.close());
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) MergingIterator(htsjdk.samtools.util.MergingIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) IOUtil(htsjdk.samtools.util.IOUtil) VCFHeader(htsjdk.variant.vcf.VCFHeader) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) PeekableIterator(htsjdk.samtools.util.PeekableIterator) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) AbstractCloseableIterator(com.github.lindenb.jvarkit.iterator.AbstractCloseableIterator) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) List(java.util.List) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) Pattern(java.util.regex.Pattern) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Path(java.nio.file.Path) Pattern(java.util.regex.Pattern) CloseableIterator(htsjdk.samtools.util.CloseableIterator) AbstractCloseableIterator(com.github.lindenb.jvarkit.iterator.AbstractCloseableIterator) MergingIterator(htsjdk.samtools.util.MergingIterator) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Example 85 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class BamSliceBed method scanIterator.

@Override
protected void scanIterator(final SAMFileHeader headerIn, final CloseableIterator<SAMRecord> iter, final SAMFileWriter sfw) {
    final SAMRecordFactory samRecordFactory = new DefaultSAMRecordFactory();
    while (iter.hasNext()) {
        final SAMRecord rec = iter.next();
        if (rec.getReadUnmappedFlag())
            continue;
        final Cigar cigar = rec.getCigar();
        if (cigar == null || cigar.isEmpty())
            continue;
        final Collection<Interval> beds = bedIntervals.getOverlapping(rec);
        if (beds.isEmpty())
            continue;
        final List<Base> align = new ArrayList<>(rec.getLengthOnReference());
        int refpos = rec.getUnclippedStart();
        int readpos = 0;
        final byte[] bases = rec.getReadBases();
        if (bases == SAMRecord.NULL_SEQUENCE)
            continue;
        final String quals;
        final boolean qual_is_available;
        if (rec.getBaseQualities() == SAMRecord.NULL_QUALS) {
            quals = StringUtils.repeatCharNTimes('#', bases.length);
            qual_is_available = false;
        } else {
            quals = rec.getBaseQualityString();
            qual_is_available = true;
        }
        for (final CigarElement ce : cigar) {
            final CigarOperator op = ce.getOperator();
            switch(op) {
                case P:
                    break;
                case D:
                case N:
                    {
                        for (int i = 0; i < ce.getLength(); ++i) {
                            final Base b = new Base();
                            b.cigaroperator = op;
                            b.refpos = refpos;
                            align.add(b);
                            refpos++;
                        }
                        break;
                    }
                case H:
                    /* ignore hard clip */
                    {
                        refpos += ce.getLength();
                        break;
                    }
                case S:
                case X:
                case EQ:
                case M:
                    {
                        for (int i = 0; i < ce.getLength(); ++i) {
                            final Base b = new Base();
                            b.refpos = refpos;
                            b.cigaroperator = op;
                            b.readbase = bases[readpos];
                            b.readqual = quals.charAt(readpos);
                            b.readpos = readpos;
                            align.add(b);
                            readpos++;
                            refpos++;
                        }
                        break;
                    }
                case I:
                    {
                        for (int i = 0; i < ce.getLength(); ++i) {
                            final Base b = new Base();
                            b.cigaroperator = op;
                            b.readbase = bases[readpos];
                            b.readqual = quals.charAt(readpos);
                            b.readpos = readpos;
                            align.add(b);
                            readpos++;
                        }
                        break;
                    }
                default:
                    throw new IllegalStateException();
            }
        }
        for (final Interval bed : beds) {
            final List<Base> copy = new ArrayList<>(align);
            /* trim 5 prime */
            int trim = -1;
            for (int x = 0; x < copy.size(); x++) {
                final Base b = copy.get(x);
                if (b.cigaroperator.isAlignment() && b.refpos >= bed.getStart()) {
                    trim = x;
                    break;
                }
            }
            if (trim > 0) {
                final List<Base> subList = copy.subList(0, trim);
                if (use_clip) {
                    for (Base b : subList) {
                        if (b.readpos != -1) {
                            b.cigaroperator = CigarOperator.SOFT_CLIP;
                        }
                    }
                    // N D
                    subList.removeIf(B -> B.readpos == -1);
                } else {
                    subList.clear();
                }
            }
            /* trim 3 prime */
            trim = -1;
            for (int x = copy.size() - 1; x >= 0; x--) {
                final Base b = copy.get(x);
                if (b.cigaroperator.isAlignment() && b.refpos <= bed.getEnd()) {
                    trim = x;
                    break;
                }
            }
            if (trim != -1) {
                final List<Base> subList = copy.subList(trim + 1, copy.size());
                if (use_clip) {
                    for (Base b : subList) {
                        if (b.readpos != -1) {
                            b.cigaroperator = CigarOperator.SOFT_CLIP;
                        }
                    }
                    // N D
                    subList.removeIf(B -> B.readpos == -1);
                } else {
                    subList.clear();
                }
            }
            if (copy.stream().noneMatch(P -> P.cigaroperator.isAlignment()))
                continue;
            if (copy.isEmpty())
                continue;
            int nrefpos = copy.stream().filter(B -> B.refpos != -1).filter(B -> !B.cigaroperator.isClipping()).mapToInt(B -> B.refpos).findFirst().orElse(-1);
            final SAMRecord newrec = samRecordFactory.createSAMRecord(sfw.getFileHeader());
            newrec.setReadName(rec.getReadName() + "#" + bed.getContig() + ":" + bed.getStart() + ":" + bed.getEnd());
            newrec.setMappingQuality(SAMRecord.UNKNOWN_MAPPING_QUALITY);
            newrec.setAlignmentStart(nrefpos);
            newrec.setReadString(copy.stream().filter(B -> B.readbase != NO_BASE).map(B -> String.valueOf((char) B.readbase)).collect(Collectors.joining()));
            if (qual_is_available) {
                newrec.setBaseQualityString(copy.stream().filter(B -> B.readqual != NO_QUAL).map(B -> String.valueOf(B.readqual)).collect(Collectors.joining()));
            } else {
                newrec.setBaseQualities(SAMRecord.NULL_QUALS);
            }
            newrec.setReadUnmappedFlag(false);
            newrec.setReadNegativeStrandFlag(rec.getReadNegativeStrandFlag());
            newrec.setReferenceName(rec.getReferenceName());
            newrec.setCigar(Cigar.fromCigarOperators(copy.stream().map(B -> B.cigaroperator).collect(Collectors.toList())));
            if (rec.hasAttribute("RG")) {
                newrec.setAttribute("RG", rec.getAttribute("RG"));
            }
            for (final String att : this.keepAttributes) {
                if (rec.hasAttribute(att)) {
                    newrec.setAttribute(att, rec.getAttribute(att));
                }
            }
            newrec.setAttribute("PG", this.spr.getId());
            sfw.addAlignment(newrec);
        }
    // end for interval
    }
// end while
}
Also used : Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) IntervalListProvider(com.github.lindenb.jvarkit.samtools.util.IntervalListProvider) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) SortOrder(htsjdk.samtools.SAMFileHeader.SortOrder) DefaultSAMRecordFactory(htsjdk.samtools.DefaultSAMRecordFactory) Interval(htsjdk.samtools.util.Interval) OnePassBamLauncher(com.github.lindenb.jvarkit.jcommander.OnePassBamLauncher) SAMRecordFactory(htsjdk.samtools.SAMRecordFactory) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Collection(java.util.Collection) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) SAMFileWriter(htsjdk.samtools.SAMFileWriter) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SAMProgramRecord(htsjdk.samtools.SAMProgramRecord) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) ArrayList(java.util.ArrayList) CigarOperator(htsjdk.samtools.CigarOperator) DefaultSAMRecordFactory(htsjdk.samtools.DefaultSAMRecordFactory) CigarElement(htsjdk.samtools.CigarElement) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) DefaultSAMRecordFactory(htsjdk.samtools.DefaultSAMRecordFactory) SAMRecordFactory(htsjdk.samtools.SAMRecordFactory) Interval(htsjdk.samtools.util.Interval)

Aggregations

CloseableIterator (htsjdk.samtools.util.CloseableIterator)103 List (java.util.List)86 Logger (com.github.lindenb.jvarkit.util.log.Logger)85 Parameter (com.beust.jcommander.Parameter)82 Program (com.github.lindenb.jvarkit.util.jcommander.Program)78 ArrayList (java.util.ArrayList)73 Collectors (java.util.stream.Collectors)71 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)69 Path (java.nio.file.Path)69 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)66 CloserUtil (htsjdk.samtools.util.CloserUtil)64 Set (java.util.Set)64 VCFHeader (htsjdk.variant.vcf.VCFHeader)59 VariantContext (htsjdk.variant.variantcontext.VariantContext)54 IOException (java.io.IOException)53 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)51 SAMRecord (htsjdk.samtools.SAMRecord)51 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)51 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)50 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)49