Search in sources :

Example 76 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class GroupByGene method read.

private void read(final String input) throws IOException {
    LineIterator lineiter = null;
    SortingCollection<Call> sortingCollection = null;
    try {
        final Pattern regexType = (StringUtil.isBlank(this.typeRegexExclude) ? null : Pattern.compile(this.typeRegexExclude));
        lineiter = (input == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(input));
        sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(), (C1, C2) -> {
            int i = C1.compareTo(C2);
            if (i != 0)
                return i;
            return C1.line.compareTo(C2.line);
        }, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingCollection.setDestructiveIteration(true);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(lineiter);
        final VCFHeader header = cah.header;
        this.the_dictionary = header.getSequenceDictionary();
        if (this.the_dictionary == null || this.the_dictionary.isEmpty()) {
            throw new JvarkitException.DictionaryMissing(input);
        }
        this.the_codec = cah.codec;
        final List<String> sampleNames;
        if (header.getSampleNamesInOrder() != null) {
            sampleNames = header.getSampleNamesInOrder();
        } else {
            sampleNames = Collections.emptyList();
        }
        final VcfTools vcfTools = new VcfTools(header);
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = Pedigree.newParser().parse(this.pedigreeFile);
        } else {
            pedigree = Pedigree.newParser().parse(header);
        }
        final Pattern tab = Pattern.compile("[\t]");
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(the_dictionary).logger(LOG);
        while (lineiter.hasNext()) {
            String line = lineiter.next();
            final VariantContext ctx = progress.watch(this.the_codec.decode(line));
            if (!ctx.isVariant())
                continue;
            if (ignore_filtered && ctx.isFiltered())
                continue;
            // simplify line
            final String[] tokens = tab.split(line);
            // ID
            tokens[2] = VCFConstants.EMPTY_ID_FIELD;
            // QUAL
            tokens[5] = VCFConstants.MISSING_VALUE_v4;
            // FILTER
            tokens[6] = VCFConstants.UNFILTERED;
            // INFO
            tokens[7] = VCFConstants.EMPTY_INFO_FIELD;
            line = String.join(VCFConstants.FIELD_SEPARATOR, Arrays.asList(tokens));
            for (final GeneName g : getGenes(vcfTools, ctx)) {
                if (regexType != null && regexType.matcher(g.type).matches())
                    continue;
                final Call c = new Call();
                c.line = line;
                c.gene = g;
                sortingCollection.add(c);
            }
        }
        CloserUtil.close(lineiter);
        lineiter = null;
        sortingCollection.doneAdding();
        /**
         * dump
         */
        final Set<String> casesSamples = pedigree.getPersons().stream().filter(P -> P.isAffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> controlsSamples = pedigree.getPersons().stream().filter(P -> P.isUnaffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> maleSamples = pedigree.getPersons().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> femaleSamples = pedigree.getPersons().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Predicate<Genotype> genotypeFilter = genotype -> {
            if (!genotype.isAvailable())
                return false;
            if (!genotype.isCalled())
                return false;
            if (genotype.isNoCall())
                return false;
            if (genotype.isHomRef())
                return false;
            if (this.ignore_filtered_genotype && genotype.isFiltered())
                return false;
            return true;
        };
        PrintStream pw = openFileOrStdoutAsPrintStream(this.outFile);
        pw.print("#chrom");
        pw.print('\t');
        pw.print("min.POS");
        pw.print('\t');
        pw.print("max.POS");
        pw.print('\t');
        pw.print("gene.name");
        pw.print('\t');
        pw.print("gene.type");
        pw.print('\t');
        pw.print("samples.affected");
        pw.print('\t');
        pw.print("count.variations");
        if (!casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.cases");
        }
        if (!controlsSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.controls");
        }
        if (!maleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.males");
        }
        if (!femaleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.females");
        }
        if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("fisher");
        }
        for (final String sample : sampleNames) {
            pw.print('\t');
            pw.print(sample);
        }
        pw.println();
        final CloseableIterator<Call> iter = sortingCollection.iterator();
        final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
        while (eqiter.hasNext()) {
            final List<Call> row = eqiter.next();
            final Call first = row.get(0);
            final List<VariantContext> variantList = row.stream().map(R -> GroupByGene.this.the_codec.decode(R.line)).collect(Collectors.toList());
            final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
            final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
            final Set<String> sampleCarryingMut = new HashSet<String>();
            final Counter<String> pedCasesCarryingMut = new Counter<String>();
            final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
            final Counter<String> malesCarryingMut = new Counter<String>();
            final Counter<String> femalesCarryingMut = new Counter<String>();
            final Counter<String> sample2count = new Counter<String>();
            for (final VariantContext ctx : variantList) {
                for (final Genotype genotype : ctx.getGenotypes()) {
                    if (!genotypeFilter.test(genotype))
                        continue;
                    final String sampleName = genotype.getSampleName();
                    sample2count.incr(sampleName);
                    sampleCarryingMut.add(sampleName);
                    if (casesSamples.contains(sampleName)) {
                        pedCasesCarryingMut.incr(sampleName);
                    }
                    if (controlsSamples.contains(sampleName)) {
                        pedCtrlsCarryingMut.incr(sampleName);
                    }
                    if (maleSamples.contains(sampleName)) {
                        malesCarryingMut.incr(sampleName);
                    }
                    if (femaleSamples.contains(sampleName)) {
                        femalesCarryingMut.incr(sampleName);
                    }
                }
            }
            pw.print(first.getContig());
            pw.print('\t');
            // convert to bed
            pw.print(minPos - 1);
            pw.print('\t');
            pw.print(maxPos);
            pw.print('\t');
            pw.print(first.gene.name);
            pw.print('\t');
            pw.print(first.gene.type);
            pw.print('\t');
            pw.print(sampleCarryingMut.size());
            pw.print('\t');
            pw.print(variantList.size());
            if (!casesSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCasesCarryingMut.getCountCategories());
            }
            if (!controlsSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCtrlsCarryingMut.getCountCategories());
            }
            if (!maleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(malesCarryingMut.getCountCategories());
            }
            if (!femaleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(femalesCarryingMut.getCountCategories());
            }
            if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
                int count_case_mut = 0;
                int count_ctrl_mut = 0;
                int count_case_wild = 0;
                int count_ctrl_wild = 0;
                for (final VariantContext ctx : variantList) {
                    for (final Genotype genotype : ctx.getGenotypes()) {
                        final String sampleName = genotype.getSampleName();
                        final boolean has_mutation = genotypeFilter.test(genotype);
                        if (controlsSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_ctrl_mut++;
                            } else {
                                count_ctrl_wild++;
                            }
                        } else if (casesSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_case_mut++;
                            } else {
                                count_case_wild++;
                            }
                        }
                    }
                }
                final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
                pw.print('\t');
                pw.print(fisher.getAsDouble());
            }
            for (final String sample : sampleNames) {
                pw.print('\t');
                pw.print(sample2count.count(sample));
            }
            pw.println();
            if (pw.checkError())
                break;
        }
        eqiter.close();
        iter.close();
        pw.flush();
        if (this.outFile != null)
            pw.close();
    } finally {
        CloserUtil.close(lineiter);
        if (sortingCollection != null)
            sortingCollection.cleanup();
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) Arrays(java.util.Arrays) LineIterator(htsjdk.tribble.readers.LineIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Function(java.util.function.Function) ParametersDelegate(com.beust.jcommander.ParametersDelegate) HashSet(java.util.HashSet) DataOutputStream(java.io.DataOutputStream) StringUtil(htsjdk.samtools.util.StringUtil) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VCFConstants(htsjdk.variant.vcf.VCFConstants) CloserUtil(htsjdk.samtools.util.CloserUtil) SnpEffPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser) PrintStream(java.io.PrintStream) SortingCollection(htsjdk.samtools.util.SortingCollection) Counter(com.github.lindenb.jvarkit.util.Counter) AbstractVCFCodec(htsjdk.variant.vcf.AbstractVCFCodec) Predicate(java.util.function.Predicate) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) List(java.util.List) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContext(htsjdk.variant.variantcontext.VariantContext) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) Collections(java.util.Collections) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) VariantContext(htsjdk.variant.variantcontext.VariantContext) LineIterator(htsjdk.tribble.readers.LineIterator) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) Counter(com.github.lindenb.jvarkit.util.Counter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Pattern(java.util.regex.Pattern) PrintStream(java.io.PrintStream) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) Pedigree(com.github.lindenb.jvarkit.util.Pedigree)

Example 77 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class VcfToHilbert method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.imgOut == null) {
        LOG.error("output image file not defined");
        return -1;
    }
    if (this.imageWidth < 1) {
        LOG.error("Bad image size:" + this.imageWidth);
        return -1;
    }
    VcfIterator iter = null;
    try {
        iter = this.openVcfIterator(oneFileOrNull(args));
        final VCFHeader header = iter.getHeader();
        this.dict = header.getSequenceDictionary();
        if (this.dict == null) {
            throw new JvarkitException.FastaDictionaryMissing("no dict in input");
        }
        final List<String> samples = header.getSampleNamesInOrder();
        if (samples.isEmpty()) {
            throw new JvarkitException.SampleMissing("no.sample.in.vcf");
        }
        LOG.info("N-Samples:" + samples.size());
        double marginWidth = (this.imageWidth - 2) * 0.05;
        this.sampleWidth = ((this.imageWidth - 2) - marginWidth) / samples.size();
        LOG.info("sample Width:" + sampleWidth);
        BufferedImage img = new BufferedImage(this.imageWidth, this.imageWidth, BufferedImage.TYPE_INT_RGB);
        this.g = (Graphics2D) img.getGraphics();
        this.g.setColor(Color.WHITE);
        this.g.fillRect(0, 0, imageWidth, imageWidth);
        g.setColor(Color.BLACK);
        final Hershey hershey = new Hershey();
        EvalCurve evalCurve = new EvalCurve();
        evalCurve.run();
        this.genomicSizePerCurveUnit = ((double) dict.getReferenceLength() / (double) (evalCurve.count));
        if (this.genomicSizePerCurveUnit < 1)
            this.genomicSizePerCurveUnit = 1;
        LOG.info("genomicSizePerCurveUnit:" + genomicSizePerCurveUnit);
        for (int x = 0; x < samples.size(); ++x) {
            String samplex = samples.get(x);
            double labelHeight = marginWidth;
            if (labelHeight > 50)
                labelHeight = 50;
            g.setColor(Color.BLACK);
            hershey.paint(g, samplex, marginWidth + x * sampleWidth, marginWidth - labelHeight, sampleWidth * 0.9, labelHeight * 0.9);
            AffineTransform old = g.getTransform();
            AffineTransform tr = AffineTransform.getTranslateInstance(marginWidth, marginWidth + x * sampleWidth);
            tr.rotate(Math.PI / 2);
            g.setTransform(tr);
            hershey.paint(g, samplex, 0.0, 0.0, sampleWidth * 0.9, labelHeight * 0.9);
            // g.drawString(this.tabixFile.getFile().getName(),0,0);
            g.setTransform(old);
            double tx = marginWidth + x * sampleWidth;
            for (int y = 0; y < samples.size(); ++y) {
                double ty = marginWidth + y * sampleWidth;
                g.translate(tx, ty);
                g.setColor(Color.BLUE);
                g.draw(new Rectangle2D.Double(0, 0, sampleWidth, sampleWidth));
                // paint each chromosome
                paintReference();
                g.translate(-tx, -ty);
            }
        }
        LOG.info("genomicSizePerCurveUnit:" + (long) genomicSizePerCurveUnit * evalCurve.count + " " + dict.getReferenceLength() + " count=" + evalCurve.count);
        LOG.info("Scanning variants");
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.dict);
        while (iter.hasNext()) {
            final VariantContext var = progress.watch(iter.next());
            for (int x = 0; x < samples.size(); ++x) {
                final String samplex = samples.get(x);
                final Genotype gx = var.getGenotype(samplex);
                if (!gx.isCalled())
                    continue;
                double tx = marginWidth + x * sampleWidth;
                for (int y = 0; y < samples.size(); ++y) {
                    final String sampley = samples.get(y);
                    final Genotype gy = var.getGenotype(sampley);
                    if (!gy.isCalled())
                        continue;
                    if (gx.isHomRef() && gy.isHomRef())
                        continue;
                    double ty = marginWidth + y * sampleWidth;
                    g.translate(tx, ty);
                    final PaintVariant paint = new PaintVariant(var, x, y);
                    paint.run();
                    g.translate(-tx, -ty);
                }
            }
        }
        progress.finish();
        this.g.dispose();
        // save file
        LOG.info("saving " + imgOut);
        if (imgOut != null) {
            ImageIO.write(img, imgOut.getName().toLowerCase().endsWith(".png") ? "PNG" : "JPG", imgOut);
        } else {
            ImageIO.write(img, "PNG", stdout());
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
    }
}
Also used : SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Hershey(com.github.lindenb.jvarkit.util.Hershey) Rectangle2D(java.awt.geom.Rectangle2D) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) BufferedImage(java.awt.image.BufferedImage) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) AffineTransform(java.awt.geom.AffineTransform) VCFHeader(htsjdk.variant.vcf.VCFHeader)

Example 78 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class VcfMultiToOne method doWork.

@Override
public int doWork(final List<String> arguments) {
    VariantContextWriter out = null;
    Set<String> args = IOUtils.unrollFiles(arguments);
    List<VcfIterator> inputs = new ArrayList<>(args.size() + 1);
    List<String> inputFiles = new ArrayList<>(args.size() + 1);
    try {
        if (args.isEmpty() && arguments.isEmpty()) {
            inputs.add(VCFUtils.createVcfIteratorStdin());
            inputFiles.add("stdin");
        } else if (args.isEmpty()) {
            LOG.error("No vcf provided");
            return -1;
        } else {
            for (final String vcfFile : args) {
                inputs.add(VCFUtils.createVcfIterator(vcfFile));
                inputFiles.add(VCFUtils.escapeInfoField(vcfFile));
            }
        }
        SAMSequenceDictionary dict = null;
        final Set<String> sampleNames = new HashSet<String>();
        final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        for (final VcfIterator in : inputs) {
            final VCFHeader header = in.getHeader();
            if (dict == null) {
                dict = header.getSequenceDictionary();
            } else if (header.getSequenceDictionary() == null) {
                LOG.error("No Dictionary in vcf");
                return -1;
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, header.getSequenceDictionary())) {
                LOG.error("Not the same dictionary between vcfs");
                return -1;
            }
            metaData.addAll(in.getHeader().getMetaDataInInputOrder());
            sampleNames.addAll(in.getHeader().getSampleNamesInOrder());
        }
        final Comparator<VariantContext> comparator = (dict == null ? VCFUtils.createChromPosRefComparator() : VCFUtils.createTidPosRefComparator(dict));
        // addMetaData(metaData);
        metaData.add(new VCFInfoHeaderLine(DEFAULT_SAMPLE_TAGID, 1, VCFHeaderLineType.String, "Sample Name from multi-sample vcf"));
        metaData.add(new VCFInfoHeaderLine(DEFAULT_SAMPLE_FILETAGID, 1, VCFHeaderLineType.String, "Origin of sample"));
        for (final String sample : sampleNames) {
            metaData.add(new VCFHeaderLine(SAMPLE_HEADER_DECLARATION, sample));
        }
        final VCFHeader h2 = new VCFHeader(metaData, Collections.singleton(DEFAULT_VCF_SAMPLE_NAME));
        recalculator.setHeader(h2);
        out = super.openVariantContextWriter(this.outputFile);
        out.writeHeader(h2);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        for (; ; ) {
            if (out.checkError())
                break;
            /* get 'smallest' variant */
            VariantContext smallest = null;
            int idx = 0;
            int best_idx = -1;
            while (idx < inputs.size()) {
                final VcfIterator in = inputs.get(idx);
                if (!in.hasNext()) {
                    CloserUtil.close(in);
                    inputs.remove(idx);
                    inputFiles.remove(idx);
                } else {
                    final VariantContext ctx = in.peek();
                    if (smallest == null || comparator.compare(smallest, ctx) > 0) {
                        smallest = ctx;
                        best_idx = idx;
                    }
                    ++idx;
                }
            }
            if (smallest == null)
                break;
            final VariantContext ctx = progress.watch(inputs.get(best_idx).next());
            if (ctx.getNSamples() == 0) {
                if (!this.discard_no_call) {
                    final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                    vcb.attribute(DEFAULT_SAMPLE_FILETAGID, inputFiles.get(best_idx));
                    vcb.genotypes(GenotypeBuilder.createMissing(DEFAULT_VCF_SAMPLE_NAME, 2));
                    out.add(this.recalculator.apply(vcb.make()));
                }
                continue;
            }
            for (int i = 0; i < ctx.getNSamples(); ++i) {
                final Genotype g = ctx.getGenotype(i);
                final String sample = g.getSampleName();
                if (!g.isCalled() && this.discard_no_call)
                    continue;
                if (!g.isAvailable() && this.discard_non_available)
                    continue;
                if (g.isHomRef() && this.discard_hom_ref)
                    continue;
                final GenotypeBuilder gb = new GenotypeBuilder(g);
                gb.name(DEFAULT_VCF_SAMPLE_NAME);
                final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                vcb.attribute(DEFAULT_SAMPLE_TAGID, sample);
                vcb.attribute(DEFAULT_SAMPLE_FILETAGID, inputFiles.get(best_idx));
                vcb.genotypes(gb.make());
                out.add(this.recalculator.apply(vcb.make()));
            }
        }
        progress.finish();
        LOG.debug("done");
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(inputs);
        CloserUtil.close(out);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Example 79 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class PcrClipReads method run.

private int run(final SamReader reader) {
    final SAMFileHeader header1 = reader.getFileHeader();
    final SAMFileHeader header2 = header1.clone();
    Optional<SAMProgramRecord> samProgramRecord = Optional.empty();
    if (this.programId) {
        final SAMProgramRecord spr = header2.createProgramRecord();
        samProgramRecord = Optional.of(spr);
        spr.setProgramName(PcrClipReads.class.getSimpleName());
        spr.setProgramVersion(this.getGitHash());
        spr.setCommandLine(getProgramCommandLine().replace('\t', ' '));
    }
    header2.addComment(getProgramName() + " " + getVersion() + ": Processed with " + getProgramCommandLine());
    header2.setSortOrder(SortOrder.unsorted);
    SAMFileWriter sw = null;
    SAMRecordIterator iter = null;
    try {
        sw = this.writingBamArgs.openSAMFileWriter(outputFile, header2, false);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1).logger(LOG);
        iter = reader.iterator();
        while (iter.hasNext()) {
            SAMRecord rec = progress.watch(iter.next());
            if (this.onlyFlag != -1 && (rec.getFlags() & this.onlyFlag) != 0) {
                sw.addAlignment(rec);
                continue;
            }
            if (rec.getReadUnmappedFlag()) {
                sw.addAlignment(rec);
                continue;
            }
            final Interval fragment = findInterval(rec);
            if (fragment == null) {
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            // strand is '-' and overap in 5' of PCR fragment
            if (rec.getReadNegativeStrandFlag() && fragment.getStart() < rec.getAlignmentStart() && rec.getAlignmentStart() < fragment.getEnd()) {
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            // strand is '+' and overap in 3' of PCR fragment
            if (!rec.getReadNegativeStrandFlag() && fragment.getStart() < rec.getAlignmentEnd() && rec.getAlignmentEnd() < fragment.getEnd()) {
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            // contained int PCR fragment
            if (rec.getAlignmentStart() >= fragment.getStart() && rec.getAlignmentEnd() <= fragment.getEnd()) {
                sw.addAlignment(rec);
                continue;
            }
            final ReadClipper readClipper = new ReadClipper();
            if (samProgramRecord.isPresent()) {
                readClipper.setProgramGroup(samProgramRecord.get().getId());
            }
            rec = readClipper.clip(rec, fragment);
            sw.addAlignment(rec);
        }
        progress.finish();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sw);
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SAMProgramRecord(htsjdk.samtools.SAMProgramRecord) Interval(htsjdk.samtools.util.Interval)

Example 80 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class PcrSliceReads method run.

@SuppressWarnings("resource")
private int run(SamReader reader) {
    ReadClipper readClipper = new ReadClipper();
    SAMFileHeader header1 = reader.getFileHeader();
    SAMFileHeader header2 = header1.clone();
    header2.addComment(getProgramName() + " " + getVersion() + ": Processed with " + getProgramCommandLine());
    header2.setSortOrder(SortOrder.unsorted);
    for (SAMReadGroupRecord srg : header1.getReadGroups()) {
        for (Interval i : this.bedIntervals.keySet()) {
            // create new read group for this interval
            SAMReadGroupRecord rg = new SAMReadGroupRecord(srg.getId() + "_" + this.bedIntervals.get(i).getName(), srg);
            header2.addReadGroup(rg);
        }
    }
    SAMFileWriter sw = null;
    SAMRecordIterator iter = null;
    try {
        sw = writingBamArgs.openSAMFileWriter(outputFile, header2, false);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
        iter = reader.iterator();
        while (iter.hasNext()) {
            SAMRecord rec = progress.watch(iter.next());
            if (rec.getReadUnmappedFlag()) {
                sw.addAlignment(rec);
                continue;
            }
            if (!rec.getReadPairedFlag()) {
                // @doc if the read is not a paired-end read ,  then the quality of the read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (rec.getMateUnmappedFlag()) {
                // @doc if the MATE is not mapped ,  then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (!rec.getProperPairFlag()) {
                // @doc if the properly-paired flag is not set,  then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (rec.getMateReferenceIndex() != rec.getReferenceIndex()) {
                // @doc if the read and the mate are not mapped on the same chromosome,  then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (rec.getReadNegativeStrandFlag() == rec.getMateNegativeStrandFlag()) {
                // @doc if the read and the mate are mapped on the same strand,  then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            int chromStart;
            int chromEnd;
            if (rec.getAlignmentStart() < rec.getMateAlignmentStart()) {
                if (rec.getReadNegativeStrandFlag()) {
                    // @doc if the read(POS) < mate(POS) and read is mapped on the negative strand, then the quality of the current read is set to zero
                    rec.setMappingQuality(0);
                    sw.addAlignment(rec);
                    continue;
                } else {
                    chromStart = rec.getAlignmentStart();
                    chromEnd = rec.getMateAlignmentStart();
                }
            } else {
                if (!rec.getReadNegativeStrandFlag()) {
                    // @doc if the read(POS) > mate(POS) and read is mapped on the positive strand, then the quality of the current read is set to zero
                    rec.setMappingQuality(0);
                    sw.addAlignment(rec);
                    continue;
                } else {
                    chromStart = rec.getMateAlignmentStart();
                    // don't use getAlignmentEnd, to be consistent with mate data
                    chromEnd = rec.getAlignmentStart();
                }
            }
            List<Interval> fragments = findIntervals(rec.getContig(), chromStart, chromEnd);
            if (fragments.isEmpty()) {
                // @doc if no BED fragment is found overlapping the read, then the quality of the read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            Interval best = null;
            if (fragments.size() == 1) {
                best = fragments.get(0);
            } else
                switch(this.ambiguityStrategy) {
                    case random:
                        {
                            best = fragments.get(this.random.nextInt(fragments.size()));
                            break;
                        }
                    case zero:
                        {
                            rec.setMappingQuality(0);
                            sw.addAlignment(rec);
                            continue;
                        }
                    case closer:
                        {
                            int best_distance = Integer.MAX_VALUE;
                            for (Interval frag : fragments) {
                                int distance = Math.abs(chromStart - frag.getStart()) + Math.abs(frag.getEnd() - chromEnd);
                                if (best == null || distance < best_distance) {
                                    best = frag;
                                    best_distance = distance;
                                }
                            }
                            break;
                        }
                    default:
                        throw new IllegalStateException();
                }
            if (clip_reads) {
                rec = readClipper.clip(rec, best);
                if (rec.getMappingQuality() == 0) {
                    sw.addAlignment(rec);
                    continue;
                }
            }
            SAMReadGroupRecord rg = rec.getReadGroup();
            if (rg == null) {
                throw new IOException("Read " + rec + " is missing a Read-Group ID . See http://broadinstitute.github.io/picard/ http://broadinstitute.github.io/picard/command-line-overview.html#AddOrReplaceReadGroups");
            }
            rec.setAttribute("RG", rg.getId() + "_" + best.getName());
            sw.addAlignment(rec);
        }
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sw);
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) IOException(java.io.IOException) IOException(java.io.IOException) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Interval(htsjdk.samtools.util.Interval)

Aggregations

SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)146 ArrayList (java.util.ArrayList)64 VariantContext (htsjdk.variant.variantcontext.VariantContext)59 VCFHeader (htsjdk.variant.vcf.VCFHeader)57 SAMRecord (htsjdk.samtools.SAMRecord)54 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)54 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)48 IOException (java.io.IOException)48 File (java.io.File)47 SamReader (htsjdk.samtools.SamReader)40 SAMFileHeader (htsjdk.samtools.SAMFileHeader)38 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)37 HashSet (java.util.HashSet)34 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)32 VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator)30 List (java.util.List)30 VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine)29 HashMap (java.util.HashMap)28 Parameter (com.beust.jcommander.Parameter)27 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)27