Search in sources :

Example 11 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class NgsFilesScanner method getFastqSampleInDirectory.

private Counter<String> getFastqSampleInDirectory(File f) {
    if (f == null || !f.isDirectory() || !f.exists() || !f.canRead())
        return EMPTY_COUNTER;
    File[] array = f.listFiles(this.fastqFilter);
    if (array == null)
        return EMPTY_COUNTER;
    Counter<String> fastqSamples = new Counter<String>();
    for (File f2 : array) {
        FastQName fqName = FastQName.parse(f2);
        if (fqName.isValid() && fqName.getSample() != null) {
            fastqSamples.incr(fqName.getSample(), f2.length());
        }
    }
    return fastqSamples;
}
Also used : FastQName(com.github.lindenb.jvarkit.util.illumina.FastQName) Counter(com.github.lindenb.jvarkit.util.Counter) File(java.io.File)

Example 12 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class GroupByGene method read.

private void read(final String input) throws IOException {
    LineIterator lineiter = null;
    SortingCollection<Call> sortingCollection = null;
    try {
        final Pattern regexType = (StringUtil.isBlank(this.typeRegexExclude) ? null : Pattern.compile(this.typeRegexExclude));
        lineiter = (input == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(input));
        sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(), (C1, C2) -> {
            int i = C1.compareTo(C2);
            if (i != 0)
                return i;
            return C1.line.compareTo(C2.line);
        }, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingCollection.setDestructiveIteration(true);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(lineiter);
        final VCFHeader header = cah.header;
        this.the_dictionary = header.getSequenceDictionary();
        if (this.the_dictionary == null || this.the_dictionary.isEmpty()) {
            throw new JvarkitException.DictionaryMissing(input);
        }
        this.the_codec = cah.codec;
        final List<String> sampleNames;
        if (header.getSampleNamesInOrder() != null) {
            sampleNames = header.getSampleNamesInOrder();
        } else {
            sampleNames = Collections.emptyList();
        }
        final VcfTools vcfTools = new VcfTools(header);
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = Pedigree.newParser().parse(this.pedigreeFile);
        } else {
            pedigree = Pedigree.newParser().parse(header);
        }
        final Pattern tab = Pattern.compile("[\t]");
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(the_dictionary).logger(LOG);
        while (lineiter.hasNext()) {
            String line = lineiter.next();
            final VariantContext ctx = progress.watch(this.the_codec.decode(line));
            if (!ctx.isVariant())
                continue;
            if (ignore_filtered && ctx.isFiltered())
                continue;
            // simplify line
            final String[] tokens = tab.split(line);
            // ID
            tokens[2] = VCFConstants.EMPTY_ID_FIELD;
            // QUAL
            tokens[5] = VCFConstants.MISSING_VALUE_v4;
            // FILTER
            tokens[6] = VCFConstants.UNFILTERED;
            // INFO
            tokens[7] = VCFConstants.EMPTY_INFO_FIELD;
            line = String.join(VCFConstants.FIELD_SEPARATOR, Arrays.asList(tokens));
            for (final GeneName g : getGenes(vcfTools, ctx)) {
                if (regexType != null && regexType.matcher(g.type).matches())
                    continue;
                final Call c = new Call();
                c.line = line;
                c.gene = g;
                sortingCollection.add(c);
            }
        }
        CloserUtil.close(lineiter);
        lineiter = null;
        sortingCollection.doneAdding();
        /**
         * dump
         */
        final Set<String> casesSamples = pedigree.getPersons().stream().filter(P -> P.isAffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> controlsSamples = pedigree.getPersons().stream().filter(P -> P.isUnaffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> maleSamples = pedigree.getPersons().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> femaleSamples = pedigree.getPersons().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Predicate<Genotype> genotypeFilter = genotype -> {
            if (!genotype.isAvailable())
                return false;
            if (!genotype.isCalled())
                return false;
            if (genotype.isNoCall())
                return false;
            if (genotype.isHomRef())
                return false;
            if (this.ignore_filtered_genotype && genotype.isFiltered())
                return false;
            return true;
        };
        PrintStream pw = openFileOrStdoutAsPrintStream(this.outFile);
        pw.print("#chrom");
        pw.print('\t');
        pw.print("min.POS");
        pw.print('\t');
        pw.print("max.POS");
        pw.print('\t');
        pw.print("gene.name");
        pw.print('\t');
        pw.print("gene.type");
        pw.print('\t');
        pw.print("samples.affected");
        pw.print('\t');
        pw.print("count.variations");
        if (!casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.cases");
        }
        if (!controlsSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.controls");
        }
        if (!maleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.males");
        }
        if (!femaleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.females");
        }
        if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("fisher");
        }
        for (final String sample : sampleNames) {
            pw.print('\t');
            pw.print(sample);
        }
        pw.println();
        final CloseableIterator<Call> iter = sortingCollection.iterator();
        final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
        while (eqiter.hasNext()) {
            final List<Call> row = eqiter.next();
            final Call first = row.get(0);
            final List<VariantContext> variantList = row.stream().map(R -> GroupByGene.this.the_codec.decode(R.line)).collect(Collectors.toList());
            final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
            final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
            final Set<String> sampleCarryingMut = new HashSet<String>();
            final Counter<String> pedCasesCarryingMut = new Counter<String>();
            final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
            final Counter<String> malesCarryingMut = new Counter<String>();
            final Counter<String> femalesCarryingMut = new Counter<String>();
            final Counter<String> sample2count = new Counter<String>();
            for (final VariantContext ctx : variantList) {
                for (final Genotype genotype : ctx.getGenotypes()) {
                    if (!genotypeFilter.test(genotype))
                        continue;
                    final String sampleName = genotype.getSampleName();
                    sample2count.incr(sampleName);
                    sampleCarryingMut.add(sampleName);
                    if (casesSamples.contains(sampleName)) {
                        pedCasesCarryingMut.incr(sampleName);
                    }
                    if (controlsSamples.contains(sampleName)) {
                        pedCtrlsCarryingMut.incr(sampleName);
                    }
                    if (maleSamples.contains(sampleName)) {
                        malesCarryingMut.incr(sampleName);
                    }
                    if (femaleSamples.contains(sampleName)) {
                        femalesCarryingMut.incr(sampleName);
                    }
                }
            }
            pw.print(first.getContig());
            pw.print('\t');
            // convert to bed
            pw.print(minPos - 1);
            pw.print('\t');
            pw.print(maxPos);
            pw.print('\t');
            pw.print(first.gene.name);
            pw.print('\t');
            pw.print(first.gene.type);
            pw.print('\t');
            pw.print(sampleCarryingMut.size());
            pw.print('\t');
            pw.print(variantList.size());
            if (!casesSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCasesCarryingMut.getCountCategories());
            }
            if (!controlsSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCtrlsCarryingMut.getCountCategories());
            }
            if (!maleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(malesCarryingMut.getCountCategories());
            }
            if (!femaleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(femalesCarryingMut.getCountCategories());
            }
            if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
                int count_case_mut = 0;
                int count_ctrl_mut = 0;
                int count_case_wild = 0;
                int count_ctrl_wild = 0;
                for (final VariantContext ctx : variantList) {
                    for (final Genotype genotype : ctx.getGenotypes()) {
                        final String sampleName = genotype.getSampleName();
                        final boolean has_mutation = genotypeFilter.test(genotype);
                        if (controlsSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_ctrl_mut++;
                            } else {
                                count_ctrl_wild++;
                            }
                        } else if (casesSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_case_mut++;
                            } else {
                                count_case_wild++;
                            }
                        }
                    }
                }
                final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
                pw.print('\t');
                pw.print(fisher.getAsDouble());
            }
            for (final String sample : sampleNames) {
                pw.print('\t');
                pw.print(sample2count.count(sample));
            }
            pw.println();
            if (pw.checkError())
                break;
        }
        eqiter.close();
        iter.close();
        pw.flush();
        if (this.outFile != null)
            pw.close();
    } finally {
        CloserUtil.close(lineiter);
        if (sortingCollection != null)
            sortingCollection.cleanup();
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) Arrays(java.util.Arrays) LineIterator(htsjdk.tribble.readers.LineIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Function(java.util.function.Function) ParametersDelegate(com.beust.jcommander.ParametersDelegate) HashSet(java.util.HashSet) DataOutputStream(java.io.DataOutputStream) StringUtil(htsjdk.samtools.util.StringUtil) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VCFConstants(htsjdk.variant.vcf.VCFConstants) CloserUtil(htsjdk.samtools.util.CloserUtil) SnpEffPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser) PrintStream(java.io.PrintStream) SortingCollection(htsjdk.samtools.util.SortingCollection) Counter(com.github.lindenb.jvarkit.util.Counter) AbstractVCFCodec(htsjdk.variant.vcf.AbstractVCFCodec) Predicate(java.util.function.Predicate) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) List(java.util.List) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContext(htsjdk.variant.variantcontext.VariantContext) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) Collections(java.util.Collections) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) VariantContext(htsjdk.variant.variantcontext.VariantContext) LineIterator(htsjdk.tribble.readers.LineIterator) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) Counter(com.github.lindenb.jvarkit.util.Counter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Pattern(java.util.regex.Pattern) PrintStream(java.io.PrintStream) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) Pedigree(com.github.lindenb.jvarkit.util.Pedigree)

Example 13 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class MsaToVcf method doWork.

@Override
public int doWork(final List<String> args) {
    VariantContextWriter w = null;
    LineIterator r = null;
    try {
        final String inputName = oneFileOrNull(args);
        if (inputName == null) {
            LOG.info("Reading from stdin");
            r = IOUtils.openStreamForLineIterator(stdin());
        } else {
            LOG.info("Reading from " + inputName);
            r = IOUtils.openURIForLineIterator(inputName);
        }
        Format format = Format.None;
        /**
         * try to guess format
         */
        while (r.hasNext() && format == Format.None) {
            final String line = r.peek();
            if (line.trim().isEmpty()) {
                r.next();
                continue;
            }
            if (line.startsWith("CLUSTAL")) {
                format = Format.Clustal;
                // consume
                r.next();
                break;
            } else if (line.startsWith(">")) {
                format = Format.Fasta;
                break;
            } else {
                LOG.error("MSA format not recognized in " + line);
                return -1;
            }
        }
        LOG.info("format : " + format);
        /**
         * parse lines as FASTA
         */
        if (Format.Fasta.equals(format)) {
            this.consensus = new FastaConsensus();
            AlignSequence curr = null;
            while (r.hasNext()) {
                String line = r.next();
                if (line.startsWith(">")) {
                    curr = new AlignSequence();
                    curr.name = line.substring(1).trim();
                    if (sample2sequence.containsKey(curr.name)) {
                        LOG.error("Sequence ID " + curr.name + " defined twice");
                        return -1;
                    }
                    sample2sequence.put(curr.name, curr);
                } else if (curr != null) {
                    curr.seq.append(line.trim());
                    this.align_length = Math.max(this.align_length, curr.seq.length());
                }
            }
        /*
				//remove heading & trailing '-'
				for(final String sample:this.sample2sequence.keySet())
					{
					final AlignSequence seq = this.sample2sequence.get(sample);
					int i=0;
					while(i<this.align_length && seq.at(i)==DELETION)
						{
						seq.seq.setCharAt(i, CLIPPING);
						++i;
						}
					i= this.align_length-1;
					while(i>=0 && seq.at(i)==DELETION)
						{
						seq.seq.setCharAt(i, CLIPPING);
						--i;
						}
					}*/
        } else /**
         * parse lines as CLUSTAL
         */
        if (Format.Clustal.equals(format)) {
            ClustalConsensus clustalconsensus = new ClustalConsensus();
            this.consensus = clustalconsensus;
            AlignSequence curr = null;
            int columnStart = -1;
            while (r.hasNext()) {
                String line = r.next();
                if (line.trim().isEmpty() || line.startsWith("CLUSTAL W")) {
                    columnStart = -1;
                    continue;
                }
                if (line.charAt(0) == ' ') {
                    if (columnStart == -1) {
                        LOG.error("illegal consensus line for " + line);
                        return -1;
                    }
                    /* if consensus doesn't exist in the first rows */
                    while (clustalconsensus.seq.length() < (this.align_length - (line.length() - columnStart))) {
                        clustalconsensus.seq.append(" ");
                    }
                    clustalconsensus.seq.append(line.substring(columnStart));
                } else {
                    if (columnStart == -1) {
                        columnStart = line.indexOf(' ');
                        if (columnStart == -1) {
                            LOG.error("no whithespace in " + line);
                            return -1;
                        }
                        while (columnStart < line.length() && line.charAt(columnStart) == ' ') {
                            columnStart++;
                        }
                    }
                    String seqname = line.substring(0, columnStart).trim();
                    curr = this.sample2sequence.get(seqname);
                    if (curr == null) {
                        curr = new AlignSequence();
                        curr.name = seqname;
                        this.sample2sequence.put(curr.name, curr);
                    }
                    int columnEnd = line.length();
                    // remove blanks and digit at the end
                    while (columnEnd - 1 > columnStart && (line.charAt(columnEnd - 1) == ' ' || Character.isDigit(line.charAt(columnEnd - 1)))) {
                        columnEnd--;
                    }
                    curr.seq.append(line.substring(columnStart, columnEnd));
                    this.align_length = Math.max(align_length, curr.seq.length());
                }
            }
        } else {
            LOG.error("Undefined input format");
            return -1;
        }
        CloserUtil.close(r);
        /* sequence consensus was set*/
        if (consensusRefName != null) {
            AlignSequence namedSequence = null;
            if ((namedSequence = sample2sequence.get(consensusRefName)) == null) {
                LOG.error("Cannot find consensus sequence \"" + consensusRefName + "\" in list of sequences: " + this.sample2sequence.keySet().toString());
                return -1;
            }
            this.consensus = new NamedConsensus(namedSequence);
        }
        /**
         * we're done, print VCF
         */
        /**
         * first, print header
         */
        Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
        vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
        // super.addMetaData(vcfHeaderLines);
        Map<String, String> mapping = new HashMap<String, String>();
        mapping.put("ID", REF);
        mapping.put("length", String.valueOf(this.align_length));
        vcfHeaderLines.add(new VCFContigHeaderLine(mapping, 1));
        Set<String> samples = new TreeSet<String>(this.sample2sequence.keySet());
        VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, samples);
        w = super.openVariantContextWriter(this.outputFile);
        w.writeHeader(vcfHeader);
        /**
         * loop over data, print header
         */
        int pos1 = 0;
        while (pos1 < align_length) {
            // is it a real variation or print all sites
            boolean is_variation;
            if (consensus.at(pos1) == MATCH) {
                if (this.printAllSites) {
                    is_variation = false;
                } else {
                    ++pos1;
                    continue;
                }
            } else {
                is_variation = true;
            }
            int pos2 = pos1 + 1;
            // don't extend if no variation and printAllSites
            while (is_variation && pos2 < align_length && consensus.at(pos2) != MATCH) {
                ++pos2;
            }
            boolean is_subsitution = (pos1 + 1 == pos2);
            if (// need pos1>0 because ALT contains prev base.
            is_subsitution && pos1 != 0 && is_variation) {
                for (Sequence seq : this.sample2sequence.values()) {
                    if (seq.at(pos1) == DELETION) {
                        is_subsitution = false;
                        break;
                    }
                }
            }
            Set<Allele> alleles = new HashSet<Allele>();
            VariantContextBuilder vcb = new VariantContextBuilder();
            List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
            /* longest variant */
            String longest = null;
            Counter<String> countAlleles = new Counter<String>();
            Map<String, String> sample2genotype = new HashMap<String, String>(samples.size());
            String namedConsensusRefAllele = "N";
            /* loop over the sequences of each seample */
            for (String sample : samples) {
                Sequence seq = this.sample2sequence.get(sample);
                String al = null;
                if (is_subsitution) {
                    if (seq.at(pos1) == CLIPPING)
                        continue;
                    al = String.valueOf(seq.at(pos1));
                } else {
                    StringBuilder sb = new StringBuilder(pos2 - pos1);
                    for (// yes -1
                    int i = Math.max(0, pos1 - 1); i < pos2; ++i) {
                        if (seq.at(i) == CLIPPING)
                            continue;
                        if (seq.at(i) == DELETION)
                            continue;
                        sb.append(seq.at(i));
                    }
                    if (sb.length() == 0)
                        continue;
                    al = sb.toString();
                }
                /* did we find the longest allele ?*/
                if (longest == null || longest.length() < al.length()) {
                    // reset count of most frequent, we'll use the longest indel or subst
                    countAlleles = new Counter<String>();
                    longest = al;
                }
                countAlleles.incr(al);
                sample2genotype.put(sample, al);
                /* if consensus sequence name was defined , record this allele for future use */
                if (consensusRefName != null && sample.equals(consensusRefName)) {
                    namedConsensusRefAllele = al;
                }
            }
            if (countAlleles.isEmpty()) {
                if (// printAllSites=false
                printAllSites == false) {
                    continue;
                }
                /* no a real variation, just add a dummy 'N' */
                countAlleles.incr("N");
            }
            String refAllStr;
            if (consensusRefName == null) {
                refAllStr = countAlleles.getMostFrequent();
            } else {
                refAllStr = namedConsensusRefAllele;
            }
            final Allele refAllele = Allele.create(refAllStr.replaceAll("[^ATGCatgc]", "N"), true);
            alleles.add(refAllele);
            /* loop over samples, , build each genotype */
            for (String sample : sample2genotype.keySet()) {
                Allele al = null;
                if (!sample2genotype.containsKey(sample)) {
                // nothing
                } else if (sample2genotype.get(sample).equals(refAllStr)) {
                    al = refAllele;
                } else {
                    al = Allele.create(sample2genotype.get(sample).replaceAll("[^ATGCatgc]", "N"), false);
                    alleles.add(al);
                }
                if (al != null) {
                    final GenotypeBuilder gb = new GenotypeBuilder(sample);
                    final List<Allele> sampleAlleles = new ArrayList<Allele>(2);
                    sampleAlleles.add(al);
                    if (!haploid)
                        sampleAlleles.add(al);
                    gb.alleles(sampleAlleles);
                    gb.DP(1);
                    genotypes.add(gb.make());
                } else {
                    genotypes.add(GenotypeBuilder.createMissing(sample, haploid ? 1 : 2));
                }
            }
            // got to 1-based ref if subst, for indel with use pos(base)-1
            final int start = pos1 + (is_subsitution ? 1 : 0);
            vcb.start(start);
            vcb.stop(start + (refAllStr.length() - 1));
            vcb.chr(REF);
            HashMap<String, Object> atts = new HashMap<String, Object>();
            atts.put(VCFConstants.DEPTH_KEY, genotypes.size());
            vcb.attributes(atts);
            vcb.alleles(alleles);
            vcb.genotypes(genotypes);
            w.add(vcb.make());
            pos1 = pos2;
        }
        w.close();
        if (outFasta != null) {
            final PrintWriter fasta = super.openFileOrStdoutAsPrintWriter(outFasta);
            for (final String sample : samples) {
                fasta.println(">" + sample);
                final Sequence seq = this.sample2sequence.get(sample);
                for (int i = 0; i < align_length; ++i) {
                    fasta.print(seq.at(i));
                }
                fasta.println();
            }
            fasta.println(">CONSENSUS");
            for (int i = 0; i < align_length; ++i) {
                fasta.print(consensus.at(i));
            }
            fasta.println();
            fasta.flush();
            fasta.close();
        }
        LOG.info("Done");
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
        CloserUtil.close(w);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) LineIterator(htsjdk.tribble.readers.LineIterator) VCFContigHeaderLine(htsjdk.variant.vcf.VCFContigHeaderLine) Counter(com.github.lindenb.jvarkit.util.Counter) TreeSet(java.util.TreeSet) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) PrintWriter(java.io.PrintWriter) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Example 14 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class VCFCompare method doWork.

@Override
public int doWork(final List<String> args) {
    if (args.isEmpty()) {
        LOG.error("VCFs missing.");
        return -1;
    }
    if (args.size() != 2) {
        System.err.println("Illegal number or arguments. Expected two VCFs");
        return -1;
    }
    PrintWriter pw = null;
    XMLStreamWriter w = null;
    InputStream in = null;
    SortingCollection<LineAndFile> variants = null;
    try {
        LineAndFileComparator varcmp = new LineAndFileComparator();
        variants = SortingCollection.newInstance(LineAndFile.class, new LineAndFileCodec(), varcmp, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        variants.setDestructiveIteration(true);
        for (int i = 0; i < 2; ++i) {
            this.inputs[i] = new Input();
            this.inputs[i].codec = VCFUtils.createDefaultVCFCodec();
            this.inputs[i].filename = args.get(i);
            LOG.info("Opening " + this.inputs[i].filename);
            in = IOUtils.openURIForReading(this.inputs[i].filename);
            final LineReader lr = new SynchronousLineReader(in);
            final LineIterator li = new LineIteratorImpl(lr);
            this.inputs[i].header = (VCFHeader) this.inputs[i].codec.readActualHeader(li);
            this.inputs[i].vepPredictionParser = new VepPredictionParserFactory(this.inputs[i].header).get();
            this.inputs[i].snpEffPredictionParser = new SnpEffPredictionParserFactory(this.inputs[i].header).get();
            this.inputs[i].annPredictionParser = new AnnPredictionParserFactory(this.inputs[i].header).get();
            while (li.hasNext()) {
                LineAndFile laf = new LineAndFile();
                laf.fileIdx = i;
                laf.line = li.next();
                variants.add(laf);
            }
            LOG.info("Done Reading " + this.inputs[i].filename);
            CloserUtil.close(li);
            CloserUtil.close(lr);
            CloserUtil.close(in);
        }
        variants.doneAdding();
        LOG.info("Done Adding");
        Set<String> commonSamples = new TreeSet<String>(this.inputs[0].header.getSampleNamesInOrder());
        commonSamples.retainAll(this.inputs[1].header.getSampleNamesInOrder());
        List<Venn0> venn1List = new ArrayList<VCFCompare.Venn0>();
        venn1List.add(new Venn1("ALL"));
        venn1List.add(new Venn1("having ID") {

            @Override
            public VariantContext filter(VariantContext ctx, int fileIndex) {
                return ctx == null || !ctx.hasID() ? null : ctx;
            }
        });
        venn1List.add(new Venn1("QUAL greater 30") {

            @Override
            public VariantContext filter(VariantContext ctx, int fileIndex) {
                return ctx == null || !ctx.hasLog10PError() || ctx.getPhredScaledQual() < 30.0 ? null : ctx;
            }
        });
        for (VariantContext.Type t : VariantContext.Type.values()) {
            venn1List.add(new VennType(t));
        }
        for (SequenceOntologyTree.Term term : SequenceOntologyTree.getInstance().getTerms()) {
            venn1List.add(new VennPred("vep", term) {

                @Override
                Set<Term> terms(VariantContext ctx, int file_id) {
                    Set<Term> tt = new HashSet<SequenceOntologyTree.Term>();
                    for (VepPredictionParser.VepPrediction pred : VCFCompare.this.inputs[file_id].vepPredictionParser.getPredictions(ctx)) {
                        tt.addAll(pred.getSOTerms());
                    }
                    return tt;
                }
            });
            venn1List.add(new VennPred("SnpEff", term) {

                @Override
                Set<Term> terms(VariantContext ctx, int file_id) {
                    Set<Term> tt = new HashSet<SequenceOntologyTree.Term>();
                    for (SnpEffPredictionParser.SnpEffPrediction pred : VCFCompare.this.inputs[file_id].snpEffPredictionParser.getPredictions(ctx)) {
                        tt.addAll(pred.getSOTerms());
                    }
                    return tt;
                }
            });
            venn1List.add(new VennPred("ANN", term) {

                @Override
                Set<Term> terms(VariantContext ctx, int file_id) {
                    Set<Term> tt = new HashSet<SequenceOntologyTree.Term>();
                    for (AnnPredictionParser.AnnPrediction pred : VCFCompare.this.inputs[file_id].annPredictionParser.getPredictions(ctx)) {
                        tt.addAll(pred.getSOTerms());
                    }
                    return tt;
                }
            });
        }
        for (String s : commonSamples) {
            venn1List.add(new VennGType(s));
        }
        /* START : digest results ====================== */
        Counter<String> diff = new Counter<String>();
        List<LineAndFile> row = new ArrayList<LineAndFile>();
        CloseableIterator<LineAndFile> iter = variants.iterator();
        for (; ; ) {
            LineAndFile rec = null;
            if (iter.hasNext()) {
                rec = iter.next();
            }
            if (rec == null || (!row.isEmpty() && varcmp.compare(row.get(0), rec) != 0)) {
                if (!row.isEmpty()) {
                    diff.incr("count.variations");
                    VariantContext[] contexes_init = new VariantContext[] { null, null };
                    for (LineAndFile var : row) {
                        if (contexes_init[var.fileIdx] != null) {
                            LOG.error("Duplicate context in " + inputs[var.fileIdx].filename + " : " + var.line);
                            continue;
                        }
                        contexes_init[var.fileIdx] = var.getContext();
                    }
                    for (Venn0 venn : venn1List) {
                        venn.visit(contexes_init);
                    }
                    row.clear();
                }
                if (rec == null)
                    break;
            }
            row.add(rec);
        }
        iter.close();
        /* END : digest results ====================== */
        pw = super.openFileOrStdoutAsPrintWriter(outputFile);
        XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
        w = xmlfactory.createXMLStreamWriter(pw);
        w.writeStartElement("html");
        w.writeStartElement("body");
        /* specific samples */
        w.writeStartElement("div");
        w.writeStartElement("dl");
        for (int i = 0; i < 3; ++i) {
            String title;
            Set<String> samples;
            switch(i) {
                case 0:
                case 1:
                    title = "Sample(s) for " + this.inputs[i].filename + ".";
                    samples = new TreeSet<String>(this.inputs[i].header.getSampleNamesInOrder());
                    samples.removeAll(commonSamples);
                    break;
                default:
                    title = "Common Sample(s).";
                    samples = new TreeSet<String>(commonSamples);
                    break;
            }
            w.writeStartElement("dt");
            w.writeCharacters(title);
            w.writeEndElement();
            w.writeStartElement("dd");
            w.writeStartElement("ol");
            for (String s : samples) {
                w.writeStartElement("li");
                w.writeCharacters(s);
                w.writeEndElement();
            }
            w.writeEndElement();
            w.writeEndElement();
        }
        // dl
        w.writeEndElement();
        // div
        w.writeEndElement();
        for (Venn0 v : venn1List) {
            v.write(w);
        }
        // body
        w.writeEndElement();
        // html
        w.writeEndElement();
        w.writeEndDocument();
        w.close();
        w = null;
        pw.flush();
        pw.close();
        pw = null;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(w);
        CloserUtil.close(pw);
        if (variants != null)
            variants.cleanup();
    }
    return 0;
}
Also used : Term(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree.Term) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) LineIterator(htsjdk.tribble.readers.LineIterator) Counter(com.github.lindenb.jvarkit.util.Counter) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) TreeSet(java.util.TreeSet) SynchronousLineReader(htsjdk.tribble.readers.SynchronousLineReader) LineReader(htsjdk.tribble.readers.LineReader) SynchronousLineReader(htsjdk.tribble.readers.SynchronousLineReader) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) LineIteratorImpl(htsjdk.tribble.readers.LineIteratorImpl) PrintWriter(java.io.PrintWriter) DataInputStream(java.io.DataInputStream) InputStream(java.io.InputStream) SnpEffPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParserFactory) Term(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree.Term) XMLStreamException(javax.xml.stream.XMLStreamException) IOException(java.io.IOException) SequenceOntologyTree(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 15 with Counter

use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.

the class VCFCompareGT method doWork.

@Override
public int doWork(final List<String> arguments) {
    final List<File> inputVcfFiles = new ArrayList<>(IOUtil.unrollFiles(arguments.stream().map(F -> new File(F)).collect(Collectors.toCollection(HashSet::new)), ".vcf", "vcf.gz"));
    if (inputVcfFiles.isEmpty()) {
        LOG.error("VCF missing.");
        return -1;
    }
    VariantComparator varcmp = new VariantComparator();
    SortingCollection<Variant> variants = null;
    final Set<String> sampleNames = new LinkedHashSet<>();
    try {
        variants = SortingCollection.newInstance(Variant.class, new VariantCodec(), varcmp, writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPaths());
        variants.setDestructiveIteration(true);
        final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
        for (int i = 0; i < inputVcfFiles.size(); ++i) {
            final File vcfFile = inputVcfFiles.get(i);
            LOG.info("Opening " + vcfFile);
            final VCFFileReader vcfFileReader = new VCFFileReader(vcfFile, false);
            final CloseableIterator<VariantContext> iter = vcfFileReader.iterator();
            final VCFHeader header = vcfFileReader.getFileHeader();
            sampleNames.addAll(header.getSampleNamesInOrder());
            metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "_" + ((i) + 1), "File: " + vcfFile.getPath()));
            long nLines = 0;
            while (iter.hasNext()) {
                final VariantContext var = iter.next();
                if (nLines++ % 10000 == 0) {
                    LOG.info(vcfFile + " " + nLines);
                }
                if (!var.isVariant())
                    continue;
                if (!var.hasGenotypes())
                    continue;
                for (final Genotype genotype : var.getGenotypes()) {
                    final Variant rec = new Variant();
                    if (!genotype.isAvailable())
                        continue;
                    if (!genotype.isCalled())
                        continue;
                    if (genotype.isNoCall())
                        continue;
                    rec.file_index = i + 1;
                    rec.sampleName = genotype.getSampleName();
                    rec.chrom = var.getContig();
                    rec.start = var.getStart();
                    rec.end = var.getEnd();
                    rec.ref = var.getReference().getDisplayString();
                    if (var.hasID()) {
                        rec.id = var.getID();
                    }
                    if (genotype.hasDP()) {
                        rec.dp = genotype.getDP();
                    }
                    if (genotype.hasGQ()) {
                        rec.gq = genotype.getGQ();
                    }
                    final List<Allele> alleles = genotype.getAlleles();
                    if (alleles == null)
                        continue;
                    if (alleles.size() == 1) {
                        rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
                        rec.a2 = rec.a1;
                    } else if (alleles.size() == 2) {
                        rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
                        rec.a2 = alleles.get(1).getDisplayString().toUpperCase();
                        if (rec.a1.compareTo(rec.a2) > 0) {
                            String tmp = rec.a2;
                            rec.a2 = rec.a1;
                            rec.a1 = tmp;
                        }
                    } else {
                        continue;
                    }
                    variants.add(rec);
                }
            }
            iter.close();
            vcfFileReader.close();
        }
        variants.doneAdding();
        LOG.info("Done Adding");
        final Set<String> newSampleNames = new HashSet<>();
        for (int i = 0; i < inputVcfFiles.size(); ++i) {
            for (final String sample : sampleNames) {
                newSampleNames.add(sample + "_" + ((i) + 1));
            }
        }
        final String GenpotypeChangedKey = "GCH";
        final String GenpotypeCreated = "GNW";
        final String GenpotypeDiff = "GDF";
        metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
        metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Qual"));
        metaData.add(new VCFFormatHeaderLine(GenpotypeChangedKey, 1, VCFHeaderLineType.Integer, "Changed Genotype"));
        metaData.add(new VCFFormatHeaderLine(GenpotypeCreated, 1, VCFHeaderLineType.Integer, "Genotype Created/Deleted"));
        metaData.add(new VCFInfoHeaderLine(GenpotypeDiff, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples with Genotype Difference"));
        final VCFHeader header = new VCFHeader(metaData, new ArrayList<String>(newSampleNames));
        final VariantContextWriter w = super.openVariantContextWriter(outputFile);
        w.writeHeader(header);
        final PosComparator posCompare = new PosComparator();
        final EqualRangeIterator<Variant> iter = new EqualRangeIterator<>(variants.iterator(), posCompare);
        while (iter.hasNext()) {
            final List<Variant> row = iter.next();
            /**
             * this sample is not always the same
             */
            final Set<String> samplesModified = new TreeSet<>();
            /**
             * the number of sample is different from vcflist.size()
             */
            final Set<String> samplesCreates = new TreeSet<>();
            final Counter<String> samplesSeen = new Counter<>();
            for (int x = 0; x < row.size(); ++x) {
                final Variant var1 = row.get(x);
                samplesSeen.incr(var1.sampleName);
                for (int y = x + 1; y < row.size(); ++y) {
                    final Variant var2 = row.get(y);
                    if (!var2.sampleName.equals(var1.sampleName))
                        continue;
                    if (var1.a1.equals(var2.a1) && var1.a2.equals(var2.a2))
                        continue;
                    samplesModified.add(var1.sampleName);
                }
            }
            for (final String sampleName : samplesSeen.keySet()) {
                if (samplesSeen.count(sampleName) != inputVcfFiles.size()) {
                    samplesCreates.add(sampleName);
                }
            }
            final Variant first = row.get(0);
            final Set<Allele> alleles = new HashSet<>();
            alleles.add(Allele.create(first.ref, true));
            for (final Variant var : row) {
                alleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
                alleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
            }
            final VariantContextBuilder b = new VariantContextBuilder(getClass().getName(), first.chrom, first.start, first.end, alleles);
            // build genotypes
            final List<Genotype> genotypes = new ArrayList<Genotype>();
            for (final Variant var : row) {
                // alleles for this genotype
                final List<Allele> galleles = new ArrayList<Allele>();
                galleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
                galleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
                final GenotypeBuilder gb = new GenotypeBuilder();
                gb.DP(var.dp);
                gb.alleles(galleles);
                gb.name(var.sampleName + "_" + var.file_index);
                gb.GQ(var.gq);
                gb.attribute(GenpotypeChangedKey, samplesModified.contains(var.sampleName) ? 1 : 0);
                gb.attribute(GenpotypeCreated, samplesCreates.contains(var.sampleName) ? 1 : 0);
                genotypes.add(gb.make());
            }
            b.genotypes(genotypes);
            b.id(first.id);
            if (!(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
                Set<String> set2 = new TreeSet<String>(samplesModified);
                set2.addAll(samplesCreates);
                b.attribute(GenpotypeDiff, set2.toArray());
            }
            if (!only_print_modified || !(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
                w.add(b.make());
            }
        }
        iter.close();
        w.close();
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        if (variants != null)
            try {
                variants.cleanup();
            } catch (Exception err) {
            }
    }
    return 0;
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) IOUtil(htsjdk.samtools.util.IOUtil) Parameter(com.beust.jcommander.Parameter) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VCFHeader(htsjdk.variant.vcf.VCFHeader) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) DataOutputStream(java.io.DataOutputStream) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) LinkedHashSet(java.util.LinkedHashSet) VCFConstants(htsjdk.variant.vcf.VCFConstants) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Counter(com.github.lindenb.jvarkit.util.Counter) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Comparator(java.util.Comparator) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) LinkedHashSet(java.util.LinkedHashSet) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VariantContext(htsjdk.variant.variantcontext.VariantContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) Counter(com.github.lindenb.jvarkit.util.Counter) TreeSet(java.util.TreeSet) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) File(java.io.File)

Aggregations

Counter (com.github.lindenb.jvarkit.util.Counter)17 SAMRecord (htsjdk.samtools.SAMRecord)8 ArrayList (java.util.ArrayList)8 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)7 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)7 Genotype (htsjdk.variant.variantcontext.Genotype)7 VariantContext (htsjdk.variant.variantcontext.VariantContext)7 File (java.io.File)7 HashSet (java.util.HashSet)7 SamReader (htsjdk.samtools.SamReader)6 VCFHeader (htsjdk.variant.vcf.VCFHeader)6 List (java.util.List)6 Set (java.util.Set)6 Parameter (com.beust.jcommander.Parameter)5 Program (com.github.lindenb.jvarkit.util.jcommander.Program)5 Logger (com.github.lindenb.jvarkit.util.log.Logger)5 Cigar (htsjdk.samtools.Cigar)5 CigarElement (htsjdk.samtools.CigarElement)5 CigarOperator (htsjdk.samtools.CigarOperator)5 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)5