
Example 6 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class ExtendReferenceWithReads method scanRegion.

/**
 * scanRegion
 * @param contig     Reference sequence of interest.
 * @param chromStart 0-based, inclusive start of the interval of interest (only used when rescue is CENTER).
 * @param chromEnd   0-based, exclusive end of the interval of interest (only used when rescue is CENTER).
 * @param rescue     which side of the contig is being rescued (LEFT, RIGHT or CENTER).
 */
private Map<Integer, Counter<Byte>> scanRegion(final SAMSequenceRecord contig, final int chromStart, final int chromEnd, final Rescue rescue) {
    final Map<Integer, Counter<Byte>> pos2bases = new HashMap<>(1 + chromEnd - chromStart);
    LOG.info("Scanning :" + contig.getSequenceName() + ":" + chromStart + "-" + chromEnd);
    for (int side = 0; side < 2; ++side) {
        // for LEFT/RIGHT rescue (5' or 3' extension) a single pass is enough
        if (!rescue.equals(Rescue.CENTER) && side > 0) {
            // already done
            break;
        }
        for (final SamReader samReader : samReaders) {
            final SAMSequenceDictionary dict2 = samReader.getFileHeader().getSequenceDictionary();
            final SAMSequenceRecord ssr2 = dict2.getSequence(contig.getSequenceName());
            if (ssr2 == null || ssr2.getSequenceLength() != contig.getSequenceLength()) {
                LOG.info("No contig " + contig.getSequenceName() + " with L=" + contig.getSequenceLength() + " bases in " + samReader.getResourceDescription());
                continue;
            }
            int mappedPos = -1;
            switch(rescue) {
                case LEFT:
                    mappedPos = 1;
                    break;
                case RIGHT:
                    mappedPos = contig.getSequenceLength();
                    break;
                case CENTER:
                    mappedPos = (side == 0 ? chromStart + 1 : chromEnd);
                    break;
                default:
                    throw new IllegalStateException();
            }
            final SAMRecordIterator iter = samReader.query(contig.getSequenceName(), mappedPos, mappedPos, false);
            while (iter.hasNext()) {
                final SAMRecord rec = iter.next();
                if (rec.getReadUnmappedFlag())
                    continue;
                if (this.filter.filterOut(rec))
                    continue;
                final Cigar cigar = rec.getCigar();
                if (cigar == null)
                    continue;
                final byte[] bases = rec.getReadBases();
                if (bases == null || bases.length == 0)
                    continue;
                int refPos1 = rec.getUnclippedStart();
                int readpos = 0;
                for (final CigarElement ce : cigar) {
                    final CigarOperator op = ce.getOperator();
                    for (int L = 0; L < ce.getLength(); ++L) {
                        if (op.consumesReadBases()) {
                            Counter<Byte> count = pos2bases.get(refPos1 - 1);
                            if (count == null) {
                                count = new Counter<>();
                                pos2bases.put(refPos1 - 1, count);
                            }
                            count.incr((byte) Character.toLowerCase(bases[readpos]));
                            readpos++;
                        }
                        if (op.consumesReferenceBases()) {
                            refPos1++;
                        }
                    }
                }
            }
            iter.close();
        }
    }
    return pos2bases;
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) HashMap(java.util.HashMap) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) CigarOperator(htsjdk.samtools.CigarOperator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) CigarElement(htsjdk.samtools.CigarElement) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord)
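
Example 7 below calls a consensus(count) helper that is not reproduced on this page. A minimal sketch of what such a helper could look like, assuming only the Counter methods already used in these examples (keySet(), count(), getCountCategories()); the actual jvarkit implementation may differ:

/** Hypothetical sketch: return the most frequent base at a position, or 'n' when no read covered it. */
private char consensus(final Counter<Byte> count) {
    if (count == null || count.getCountCategories() == 0)
        return 'n';
    byte best = 'n';
    long bestCount = -1L;
    for (final Byte base : count.keySet()) {
        final long n = count.count(base);
        if (n > bestCount) {
            bestCount = n;
            best = base;
        }
    }
    // scanRegion stored read bases lower-cased, so the consensus stays lower-case
    return (char) best;
}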

Example 7 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class ExtendReferenceWithReads method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidx == null) {
        LOG.error("No REF defined");
        return -1;
    }
    this.samReaders.clear();
    PrintStream out = null;
    try {
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(faidx);
        SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        if (dict == null) {
            LOG.error("Reference file is missing a sequence dictionary (use picard)");
            return -1;
        }
        final SamReaderFactory srf = super.createSamReaderFactory();
        srf.setOption(Option.CACHE_FILE_BASED_INDEXES, true);
        for (final String uri : IOUtils.unrollFiles(args)) {
            LOG.info("opening BAM " + uri);
            final SamReader sr = srf.open(SamInputResource.of(uri));
            /* doesn't work with remote ?? */
            if (!sr.hasIndex()) {
                LOG.error("file " + uri + " is not indexed");
                return -1;
            }
            final SAMFileHeader header = sr.getFileHeader();
            if (!header.getSortOrder().equals(SortOrder.coordinate)) {
                LOG.error("file " + uri + " not sorted on coordinate but " + header.getSortOrder());
                return -1;
            }
            final SAMSequenceDictionary dict2 = header.getSequenceDictionary();
            if (dict2 == null) {
                LOG.error("file " + uri + " needs a sequence dictionary");
                return -1;
            }
            samReaders.add(sr);
        }
        if (samReaders.isEmpty()) {
            LOG.error("No BAM defined");
            return -1;
        }
        out = super.openFileOrStdoutAsPrintStream(this.outputFile);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            final GenomicSequence genomic = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
            int chromStart = 0;
            int nPrinted = 0;
            out.print(">");
            out.print(ssr.getSequenceName());
            for (final Rescue side : Rescue.values()) {
                switch(side) {
                    case LEFT:
                        /* look before position 0 of chromosome */
                        {
                            final Map<Integer, Counter<Byte>> pos2bases = scanRegion(ssr, -1, -1, side);
                            int newstart = 0;
                            for (final Integer pos : pos2bases.keySet()) {
                                if (pos >= 0)
                                    continue;
                                newstart = Math.min(newstart, pos);
                            }
                            while (newstart < 0) {
                                final Counter<Byte> count = pos2bases.get(newstart);
                                if (nPrinted % 60 == 0)
                                    out.println();
                                out.print(consensus(count));
                                newstart++;
                                ++nPrinted;
                            }
                            break;
                        }
                    case RIGHT:
                        /* look after position > length(chromosome) */
                        {
                            final Map<Integer, Counter<Byte>> pos2bases = this.scanRegion(ssr, -1, -1, side);
                            int newend = ssr.getSequenceLength();
                            for (final Integer pos : pos2bases.keySet()) {
                                if (pos < ssr.getSequenceLength())
                                    continue;
                                newend = Math.max(newend, pos + 1);
                            }
                            for (int i = ssr.getSequenceLength(); i < newend; i++) {
                                final Counter<Byte> count = pos2bases.get(i);
                                if (nPrinted % 60 == 0)
                                    out.println();
                                out.print(consensus(count));
                                ++nPrinted;
                            }
                            break;
                        }
                    case CENTER:
                        /* 0 to chromosome-length */
                        {
                            chromStart = 0;
                            while (chromStart < genomic.length()) {
                                final char base = Character.toUpperCase(genomic.charAt(chromStart));
                                if (base != 'N') {
                                    progress.watch(ssr.getSequenceName(), chromStart);
                                    if (nPrinted % 60 == 0)
                                        out.println();
                                    out.print(base);
                                    ++chromStart;
                                    ++nPrinted;
                                    continue;
                                }
                                int chromEnd = chromStart + 1;
                                while (chromEnd < genomic.length() && Character.toUpperCase(genomic.charAt(chromEnd)) == 'N') {
                                    ++chromEnd;
                                }
                                if (chromEnd - chromStart < this.minLenNNNNContig) {
                                    while (chromStart < chromEnd) {
                                        progress.watch(ssr.getSequenceName(), chromStart);
                                        if (nPrinted % 60 == 0)
                                            out.println();
                                        out.print(base);
                                        ++chromStart;
                                        ++nPrinted;
                                    }
                                    continue;
                                }
                                final Map<Integer, Counter<Byte>> pos2bases = scanRegion(ssr, chromStart, chromEnd, side);
                                while (chromStart < chromEnd) {
                                    final Counter<Byte> count = pos2bases.get(chromStart);
                                    if (nPrinted % 60 == 0)
                                        out.println();
                                    if (count == null) {
                                        out.print('N');
                                    } else {
                                        out.print(consensus(count));
                                    }
                                    ++chromStart;
                                    ++nPrinted;
                                    continue;
                                }
                            }
                            break;
                        }
                }
            // end switch type
            }
            out.println();
        }
        progress.finish();
        out.flush();
        out.close();
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        for (final SamReader r : samReaders) CloserUtil.close(r);
        samReaders.clear();
    }
}
Also used : PrintStream(java.io.PrintStream) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) SAMFileHeader(htsjdk.samtools.SAMFileHeader) HashMap(java.util.HashMap) Map(java.util.Map)
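
The FASTA output above relies on a simple invariant: a newline is printed before a base whenever nPrinted is a multiple of 60, so the header line is closed by the first base and every sequence line holds at most 60 characters. The same pattern in isolation, as a self-contained sketch using java.io.PrintStream (names are illustrative, not from jvarkit):

// Illustrative only: wrap an arbitrary sequence at 60 characters per FASTA line.
static void writeFasta(final PrintStream out, final String name, final CharSequence bases) {
    out.print(">");
    out.print(name);
    int nPrinted = 0;
    for (int i = 0; i < bases.length(); i++) {
        if (nPrinted % 60 == 0)
            out.println(); // ends the header line, then every 60-character sequence line
        out.print(bases.charAt(i));
        ++nPrinted;
    }
    out.println();
}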

Example 8 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class VcfStage method makeGenotypePie.

private PieChart makeGenotypePie(final VariantContext ctx) {
    final Counter<GenotypeType> countTypes = new Counter<>();
    if (ctx != null) {
        for (final Genotype g : ctx.getGenotypes()) {
            // ignore genotype if not displayed
            final SampleDef sampleDef = this.name2sampledef.get(g.getSampleName());
            if (sampleDef == null || !sampleDef.isDisplayed())
                continue;
            countTypes.incr(g.getType());
        }
    }
    final ObservableList<PieChart.Data> pieChartData = FXCollections.observableArrayList();
    for (final GenotypeType t : GenotypeType.values()) {
        int c = (int) countTypes.count(t);
        if (c == 0)
            continue;
        pieChartData.add(new PieChart.Data(t.name() + " = " + c, c));
    }
    final PieChart chart = new PieChart(pieChartData);
    chart.setLegendVisible(false);
    return chart;
}
Also used : Counter(com.github.lindenb.jvarkit.util.Counter) PieChart(javafx.scene.chart.PieChart) GenotypeType(htsjdk.variant.variantcontext.GenotypeType) Genotype(htsjdk.variant.variantcontext.Genotype) Paint(javafx.scene.paint.Paint)
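
For readers without jvarkit on the classpath, the same tally can be written with plain java.util collections; this sketch mirrors the Counter calls above (incr becomes Map.merge, count becomes getOrDefault) and is illustrative only. It assumes java.util.EnumMap/Map plus the htsjdk VariantContext, Genotype and GenotypeType classes already imported in the example:

// Illustrative only: the genotype tally from makeGenotypePie without jvarkit's Counter.
static Map<GenotypeType, Long> tallyGenotypes(final VariantContext ctx) {
    final Map<GenotypeType, Long> countTypes = new EnumMap<>(GenotypeType.class);
    if (ctx != null) {
        for (final Genotype g : ctx.getGenotypes()) {
            countTypes.merge(g.getType(), 1L, Long::sum); // Counter.incr(g.getType())
        }
    }
    return countTypes; // countTypes.getOrDefault(t, 0L) plays the role of Counter.count(t)
}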

Example 9 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class VCFComm method doWork.

@Override
public int doWork(final List<String> args) {
    CloseableIterator<LineAndFile> iter = null;
    SortingCollection<LineAndFile> variants = null;
    VariantContextWriter w = null;
    try {
        if (args.isEmpty()) {
            LOG.error("Illegal number of arguments");
            return -1;
        }
        Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        variants = SortingCollection.newInstance(LineAndFile.class, new LineAndFileCodec(), new LineAndFileComparator(), super.sortingCollectionArgs.getMaxRecordsInRam(), super.sortingCollectionArgs.getTmpPaths());
        variants.setDestructiveIteration(true);
        /**
         * new sample names in the output VCF: one sample per file
         */
        final Map<Integer, String> fileid2sampleName = new TreeMap<>();
        /**
         * sample names as they appear in the original VCF headers
         */
        final Counter<String> countInputSamples = new Counter<String>();
        /**
         * dicts
         */
        final List<SAMSequenceDictionary> all_dictionaries = new ArrayList<>();
        for (final String vcffilename : IOUtils.unrollFiles(args)) {
            LOG.info("Reading from " + vcffilename);
            final Input input = super.put(variants, vcffilename);
            String sampleName = vcffilename;
            if (sampleName.endsWith(".vcf.gz")) {
                sampleName = sampleName.substring(0, sampleName.length() - 7);
            } else if (sampleName.endsWith(".vcf.gz")) {
                sampleName = sampleName.substring(0, sampleName.length() - 4);
            }
            int slash = sampleName.lastIndexOf(File.separatorChar);
            if (slash != -1)
                sampleName = sampleName.substring(slash + 1);
            int suffix = 1;
            // loop until we find a unique name
            for (; ; ) {
                final String key = sampleName + (suffix == 1 ? "" : "_" + suffix);
                if (fileid2sampleName.values().contains(key)) {
                    suffix++;
                    continue;
                }
                fileid2sampleName.put(input.file_id, key);
                metaData.add(new VCFHeaderLine(key, vcffilename));
                break;
            }
            for (final String sname : input.codecAndHeader.header.getSampleNamesInOrder()) {
                countInputSamples.incr(sname);
            }
            all_dictionaries.add(input.codecAndHeader.header.getSequenceDictionary());
        }
        variants.doneAdding();
        /**
         * unique sample name, if the same single sample is present in every input VCF
         */
        Optional<String> unqueSampleName = Optional.empty();
        if (countInputSamples.getCountCategories() == 1 && countInputSamples.count(countInputSamples.keySet().iterator().next()) == fileid2sampleName.size()) {
            unqueSampleName = Optional.of(countInputSamples.keySet().iterator().next());
            LOG.info("Unique sample name is " + unqueSampleName.get());
        }
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_FILTER_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY);
        metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
        final VCFFilterHeaderLine variantNotCalledInAllVcf = new VCFFilterHeaderLine("NotCalledEveryWhere", "Variant was NOT called in all input VCF");
        metaData.add(variantNotCalledInAllVcf);
        final VCFFilterHeaderLine variantWasFiltered = new VCFFilterHeaderLine("VariantWasFiltered", "At least one variant was filtered");
        metaData.add(variantWasFiltered);
        final VCFFormatHeaderLine variantQUALFormat = new VCFFormatHeaderLine("VCQUAL", 1, VCFHeaderLineType.Float, "Variant Quality");
        metaData.add(variantQUALFormat);
        metaData.add(new VCFFormatHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Number of alleles in the src vcf"));
        metaData.add(new VCFFormatHeaderLine(VCFConstants.ALLELE_COUNT_KEY, 1, VCFHeaderLineType.Integer, "Number of ALT alleles"));
        final VCFInfoHeaderLine foundInCountVcfInfo = new VCFInfoHeaderLine("NVCF", 1, VCFHeaderLineType.Integer, "Number of VCFs in which this variant was found");
        metaData.add(foundInCountVcfInfo);
        final VCFInfoHeaderLine variantTypesInfo = new VCFInfoHeaderLine("VTYPES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Distinct Variants type");
        metaData.add(variantTypesInfo);
        final VCFFilterHeaderLine multipleTypeFilters = new VCFFilterHeaderLine("DiscordantTypes", "Discordant types at this position");
        metaData.add(multipleTypeFilters);
        final VCFFormatHeaderLine variantTypeFormat = new VCFFormatHeaderLine("VTYPE", 1, VCFHeaderLineType.String, "Variant Type");
        metaData.add(variantTypeFormat);
        final VCFFilterHeaderLine uniqueVariantDiscordantGTFilter;
        if (unqueSampleName.isPresent()) {
            metaData.add(new VCFHeaderLine("UniqSample", unqueSampleName.get()));
            uniqueVariantDiscordantGTFilter = new VCFFilterHeaderLine("DiscordantGenotypeForUniqSample", "Genotype discordant for sample " + unqueSampleName.get());
            metaData.add(uniqueVariantDiscordantGTFilter);
        } else {
            uniqueVariantDiscordantGTFilter = null;
        }
        final VCFHeader header = new VCFHeader(metaData, new ArrayList<>(fileid2sampleName.values()));
        // use a shared sequence dictionary only when every input VCF has one
        if (!normalize_chr && !all_dictionaries.contains(null)) {
            SAMSequenceDictionary thedict = null;
            for (int x = 0; x < all_dictionaries.size(); ++x) {
                SAMSequenceDictionary d = all_dictionaries.get(x);
                if (thedict == null) {
                    thedict = d;
                } else if (!SequenceUtil.areSequenceDictionariesEqual(d, thedict)) {
                    thedict = null;
                    break;
                }
            }
            if (thedict != null)
                header.setSequenceDictionary(thedict);
        }
        w = super.openVariantContextWriter(super.outputFile);
        w.writeHeader(header);
        final List<LineAndFile> row = new ArrayList<LineAndFile>(super.inputs.size());
        final Comparator<LineAndFile> posCompare = (A, B) -> A.getContigPosRef().compareTo(B.getContigPosRef());
        iter = variants.iterator();
        for (; ; ) {
            LineAndFile rec = null;
            if (iter.hasNext()) {
                rec = iter.next();
            }
            if (rec == null || (!row.isEmpty() && posCompare.compare(row.get(0), rec) != 0)) {
                if (!row.isEmpty()) {
                    final VariantContext first = row.get(0).getContext();
                    /* file ids in which this variant was found */
                    Set<Integer> fileids_for_variant = row.stream().map(LAF -> LAF.fileIdx).collect(Collectors.toSet());
                    // handle the case where a single VCF has several variants with the same chrom/pos/ref but different ALT alleles
                    if (row.size() != fileids_for_variant.size()) {
                        for (; ; ) {
                            boolean ok = true;
                            for (int x = 0; ok && x + 1 < row.size(); ++x) {
                                final VariantContext ctxx = row.get(x).getContext();
                                final List<Allele> altsx = ctxx.getAlternateAlleles();
                                for (int y = x + 1; ok && y < row.size(); ++y) {
                                    if (row.get(x).fileIdx != row.get(y).fileIdx)
                                        continue;
                                    final VariantContext ctxy = row.get(y).getContext();
                                    final List<Allele> altsy = ctxy.getAlternateAlleles();
                                    if (altsx.equals(altsy))
                                        continue;
                                    if (!ctxx.isVariant() && ctxy.isVariant()) {
                                        row.remove(x);
                                    } else if (ctxx.isVariant() && !ctxy.isVariant()) {
                                        row.remove(y);
                                    } else if (!ctxx.isSNP() && ctxy.isSNP()) {
                                        row.remove(x);
                                    } else if (ctxx.isSNP() && !ctxy.isSNP()) {
                                        row.remove(y);
                                    } else if (altsx.size() > altsy.size()) {
                                        row.remove(x);
                                    } else if (altsx.size() < altsy.size()) {
                                        row.remove(y);
                                    } else {
                                        row.remove(y);
                                    }
                                    ok = false;
                                    break;
                                }
                            }
                            if (ok)
                                break;
                        }
                        fileids_for_variant = row.stream().map(LAF -> LAF.fileIdx).collect(Collectors.toSet());
                    }
                    if (row.size() != fileids_for_variant.size()) {
                        LOG.error("There are some duplicated variants at the position " + new ContigPosRef(first) + " in the same vcf file");
                        for (final LineAndFile laf : row) {
                            LOG.error("File [" + laf.fileIdx + "]" + fileid2sampleName.get(laf.fileIdx));
                            LOG.error("\t" + laf.getContigPosRef());
                        }
                        row.clear();
                    } else {
                        final Set<Allele> alleles = row.stream().flatMap(R -> R.getContext().getAlleles().stream()).collect(Collectors.toSet());
                        final VariantContextBuilder vcb = new VariantContextBuilder(getClass().getName(), first.getContig(), first.getStart(), first.getEnd(), alleles);
                        final Set<String> filters = new HashSet<>();
                        final Set<VariantContext.Type> variantContextTypes = new HashSet<>();
                        final List<Genotype> genotypes = new ArrayList<Genotype>();
                        for (final LineAndFile laf : row) {
                            if (laf.getContext().isFiltered())
                                filters.add(variantWasFiltered.getID());
                            variantContextTypes.add(laf.getContext().getType());
                            final GenotypeBuilder gbuilder = new GenotypeBuilder();
                            gbuilder.name(fileid2sampleName.get(laf.fileIdx));
                            if (unqueSampleName.isPresent()) {
                                final Genotype g0 = laf.getContext().getGenotype(unqueSampleName.get());
                                if (g0 == null) {
                                    iter.close();
                                    w.close();
                                    throw new IllegalStateException("Cannot find genotype for " + unqueSampleName.get());
                                }
                                if (g0.hasDP())
                                    gbuilder.DP(g0.getDP());
                                if (g0.hasGQ())
                                    gbuilder.GQ(g0.getGQ());
                                gbuilder.alleles(g0.getAlleles());
                            } else {
                                gbuilder.alleles(Arrays.asList(first.getReference(), first.getReference()));
                                if (laf.getContext().hasAttribute(VCFConstants.DEPTH_KEY)) {
                                    gbuilder.DP(laf.getContext().getAttributeAsInt(VCFConstants.DEPTH_KEY, 0));
                                }
                            }
                            if (laf.getContext().isFiltered()) {
                                gbuilder.filter("VCFFILTERED");
                            }
                            if (laf.getContext().hasLog10PError()) {
                                gbuilder.attribute(variantQUALFormat.getID(), laf.getContext().getPhredScaledQual());
                            }
                            gbuilder.attribute(VCFConstants.ALLELE_NUMBER_KEY, laf.getContext().getGenotypes().stream().flatMap(G -> G.getAlleles().stream()).filter(A -> !A.isNoCall()).count());
                            gbuilder.attribute(VCFConstants.ALLELE_COUNT_KEY, laf.getContext().getGenotypes().stream().flatMap(G -> G.getAlleles().stream()).filter(A -> !(A.isReference() || A.isNoCall())).count());
                            gbuilder.attribute(variantTypeFormat.getID(), laf.getContext().getType().name());
                            genotypes.add(gbuilder.make());
                        }
                        final String id = String.join(";", row.stream().map(LAF -> LAF.getContext()).filter(V -> V.hasID()).map(V -> V.getID()).collect(Collectors.toSet()));
                        if (!id.isEmpty())
                            vcb.id(id);
                        vcb.genotypes(genotypes);
                        if (unqueSampleName.isPresent()) {
                            boolean all_same = true;
                            for (int x = 0; all_same && x + 1 < genotypes.size(); ++x) {
                                if (!genotypes.get(x).isCalled())
                                    continue;
                                for (int y = x + 1; all_same && y < genotypes.size(); ++y) {
                                    if (!genotypes.get(y).isCalled())
                                        continue;
                                    if (!genotypes.get(x).sameGenotype(genotypes.get(y), true)) {
                                        all_same = false;
                                        break;
                                    }
                                }
                            }
                            if (!all_same)
                                filters.add(uniqueVariantDiscordantGTFilter.getID());
                        }
                        // Add AN
                        vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, genotypes.stream().filter(G -> G.isCalled()).mapToInt(G -> G.getAlleles().size()).sum());
                        if (!variantContextTypes.isEmpty()) {
                            vcb.attribute(variantTypesInfo.getID(), new ArrayList<>(variantContextTypes.stream().map(T -> T.name()).collect(Collectors.toSet())));
                            if (variantContextTypes.size() > 1) {
                                filters.add(multipleTypeFilters.getID());
                            }
                        }
                        vcb.attribute(foundInCountVcfInfo.getID(), fileids_for_variant.size());
                        boolean print = true;
                        if (row.size() == super.inputs.size() && ignore_everywhere) {
                            print = false;
                        }
                        if (fileids_for_variant.size() != fileid2sampleName.size()) {
                            filters.add(variantNotCalledInAllVcf.getID());
                            if (only_everywhere) {
                                print = false;
                            }
                        }
                        vcb.filters(filters);
                        if (print) {
                            w.add(vcb.make());
                        }
                    }
                    row.clear();
                }
                if (rec == null)
                    break;
            }
            row.add(rec);
        }
        iter.close();
        iter = null;
        w.close();
        w = null;
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(w);
        try {
            if (variants != null)
                variants.cleanup();
        } catch (Exception err) {
        }
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigPosRef(com.github.lindenb.jvarkit.util.vcf.ContigPosRef) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) TreeMap(java.util.TreeMap) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Optional(java.util.Optional) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Counter(com.github.lindenb.jvarkit.util.Counter) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) ContigPosRef(com.github.lindenb.jvarkit.util.vcf.ContigPosRef) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) TreeMap(java.util.TreeMap) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)
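
The unique-sample test near the top of Example 9 holds exactly when every input VCF declares the same single sample: countInputSamples then has one category, and that category's tally equals the number of input files. A small standalone illustration with toy values (not from jvarkit):

// Toy illustration of the uniqueness test used in VCFComm.doWork.
final Counter<String> countInputSamples = new Counter<String>();
countInputSamples.incr("S1"); // sample name seen in input file 1
countInputSamples.incr("S1"); // sample name seen in input file 2
final int nInputFiles = 2;
final boolean uniqueSample = countInputSamples.getCountCategories() == 1
        && countInputSamples.count(countInputSamples.keySet().iterator().next()) == nInputFiles;
// uniqueSample == true : the single sample "S1" is present in every input VCF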

Example 10 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class SamClipIndelFraction method doWork.

@Override
public int doWork(final List<String> args) {
    SamReader sfr = null;
    SAMRecordIterator iter = null;
    PrintWriter pw = null;
    try {
        sfr = openSamReader(oneFileOrNull(args));
        pw = super.openFileOrStdoutAsPrintWriter(outputFile);
        long total_bases_count = 0L;
        long count_clipped_reads = 0L;
        long count_clipped_left_reads = 0L;
        long count_clipped_right_reads = 0L;
        long count_unclipped_reads = 0L;
        long count_unmapped_reads = 0L;
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(sfr.getFileHeader()).logger(LOG);
        Counter<Integer> counter = new Counter<>();
        iter = sfr.iterator();
        while (iter.hasNext()) {
            final SAMRecord record = progress.watch(iter.next());
            if (record.getReadUnmappedFlag()) {
                ++count_unmapped_reads;
                continue;
            }
            final Cigar cigar = record.getCigar();
            int left_clip_length = 0;
            int right_clip_length = 0;
            int deletion_N_length = 0;
            int deletion_D_length = 0;
            int insertion_length = 0;
            boolean onLeft = true;
            for (int i = 0; i < cigar.numCigarElements(); ++i) {
                final CigarElement ce = cigar.getCigarElement(i);
                final CigarOperator op = ce.getOperator();
                switch(op) {
                    case N:
                        {
                            onLeft = false;
                            deletion_N_length += ce.getLength();
                            total_bases_count += ce.getLength();
                            break;
                        }
                    case D:
                        {
                            onLeft = false;
                            deletion_D_length += ce.getLength();
                            total_bases_count += ce.getLength();
                            break;
                        }
                    case I:
                        {
                            onLeft = false;
                            insertion_length += ce.getLength();
                            total_bases_count += ce.getLength();
                            break;
                        }
                    case S:
                    case H:
                        {
                            if (onLeft) {
                                if (record.getReadNegativeStrandFlag()) {
                                    right_clip_length += ce.getLength();
                                } else {
                                    left_clip_length += ce.getLength();
                                }
                            } else {
                                if (record.getReadNegativeStrandFlag()) {
                                    left_clip_length += ce.getLength();
                                } else {
                                    right_clip_length += ce.getLength();
                                }
                            }
                            total_bases_count += ce.getLength();
                            break;
                        }
                    default:
                        {
                            onLeft = false;
                            if (op.consumesReadBases()) {
                                total_bases_count += ce.getLength();
                            }
                            break;
                        }
                }
            }
            if (left_clip_length + right_clip_length == 0) {
                count_unclipped_reads++;
            } else {
                if (left_clip_length > 0)
                    count_clipped_left_reads++;
                if (right_clip_length > 0)
                    count_clipped_right_reads++;
                count_clipped_reads++;
            }
            switch(type) {
                case leftclip:
                    counter.incr(left_clip_length);
                    break;
                case rightclip:
                    counter.incr(right_clip_length);
                    break;
                case allclip:
                    counter.incr(left_clip_length + right_clip_length);
                    break;
                case deletion:
                    counter.incr(deletion_D_length + deletion_N_length);
                    break;
                case insert:
                    counter.incr(insertion_length);
                    break;
                default:
                    LOG.error("Bad type: " + type);
                    return -1;
            }
        }
        progress.finish();
        pw.println("##UNMAPPED_READS=" + count_unmapped_reads);
        pw.println("##MAPPED_READS=" + (count_clipped_reads + count_unclipped_reads));
        pw.println("##CLIPPED_READS=" + count_clipped_reads);
        pw.println("##CLIPPED_READS_5_PRIME=" + count_clipped_left_reads);
        pw.println("##CLIPPED_READS_3_PRIME=" + count_clipped_right_reads);
        pw.println("##UNCLIPPED_READS=" + count_unclipped_reads);
        pw.println("##COUNT_BASES=" + total_bases_count);
        pw.print("#");
        switch(type) {
            case leftclip:
                pw.print("CLIP_5_PRIME");
                break;
            case rightclip:
                pw.print("CLIP_3_PRIME");
                break;
            case allclip:
                pw.print("CLIP");
                break;
            case deletion:
                pw.print("DELETION");
                break;
            case insert:
                pw.print("INSERTION");
                break;
            default:
                LOG.error("Bad type: " + type);
                return -1;
        }
        pw.println("\tCOUNT\tFRACTION_OF_MAPPED_READS");
        for (final Integer size : new TreeSet<Integer>(counter.keySet())) {
            pw.print(size);
            pw.print('\t');
            pw.print(counter.count(size));
            pw.print('\t');
            pw.println(counter.count(size) / (double) (count_clipped_reads + count_unclipped_reads));
        }
        pw.flush();
        pw.close();
        pw = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sfr);
        CloserUtil.close(pw);
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) Cigar(htsjdk.samtools.Cigar) TreeSet(java.util.TreeSet) SAMRecord(htsjdk.samtools.SAMRecord) PrintWriter(java.io.PrintWriter)
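
In the S/H branch of Example 10, left and right clips are swapped for reads mapped on the negative strand, so CLIPPED_READS_5_PRIME and CLIPPED_READS_3_PRIME are reported in read orientation rather than reference orientation. A small self-contained illustration using htsjdk's TextCigarCodec, Cigar, CigarElement and CigarOperator (values chosen for the example, not from jvarkit):

// Illustrative only: a 5S90M5H read on the negative strand yields 5 bases of 5'-clip from the
// trailing 5H and 5 bases of 3'-clip from the leading 5S.
final Cigar cigar = TextCigarCodec.decode("5S90M5H");
final boolean negativeStrand = true;
int leftClip = 0, rightClip = 0;
boolean onLeft = true;
for (final CigarElement ce : cigar) {
    final CigarOperator op = ce.getOperator();
    if (op == CigarOperator.S || op == CigarOperator.H) {
        if (onLeft != negativeStrand)
            leftClip += ce.getLength();
        else
            rightClip += ce.getLength();
    } else {
        onLeft = false;
    }
}
// leftClip == 5 (trailing 5H), rightClip == 5 (leading 5S)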

Aggregations

Counter (com.github.lindenb.jvarkit.util.Counter)17 SAMRecord (htsjdk.samtools.SAMRecord)8 ArrayList (java.util.ArrayList)8 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)7 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)7 Genotype (htsjdk.variant.variantcontext.Genotype)7 VariantContext (htsjdk.variant.variantcontext.VariantContext)7 File (java.io.File)7 HashSet (java.util.HashSet)7 SamReader (htsjdk.samtools.SamReader)6 VCFHeader (htsjdk.variant.vcf.VCFHeader)6 List (java.util.List)6 Set (java.util.Set)6 Parameter (com.beust.jcommander.Parameter)5 Program (com.github.lindenb.jvarkit.util.jcommander.Program)5 Logger (com.github.lindenb.jvarkit.util.log.Logger)5 Cigar (htsjdk.samtools.Cigar)5 CigarElement (htsjdk.samtools.CigarElement)5 CigarOperator (htsjdk.samtools.CigarOperator)5 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)5