Search in sources :

Example 1 with FisherExactTest

use of com.github.lindenb.jvarkit.math.stats.FisherExactTest in project jvarkit by lindenb.

the class GroupByGene method read.

private void read(final String input) throws IOException {
    LineIterator lineiter = null;
    SortingCollection<Call> sortingCollection = null;
    try {
        final Pattern regexType = (StringUtil.isBlank(this.typeRegexExclude) ? null : Pattern.compile(this.typeRegexExclude));
        lineiter = (input == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(input));
        sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(), (C1, C2) -> {
            int i = C1.compareTo(C2);
            if (i != 0)
                return i;
            return C1.line.compareTo(C2.line);
        }, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingCollection.setDestructiveIteration(true);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(lineiter);
        final VCFHeader header = cah.header;
        this.the_dictionary = header.getSequenceDictionary();
        if (this.the_dictionary == null || this.the_dictionary.isEmpty()) {
            throw new JvarkitException.DictionaryMissing(input);
        }
        this.the_codec = cah.codec;
        final List<String> sampleNames;
        if (header.getSampleNamesInOrder() != null) {
            sampleNames = header.getSampleNamesInOrder();
        } else {
            sampleNames = Collections.emptyList();
        }
        final VcfTools vcfTools = new VcfTools(header);
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = Pedigree.newParser().parse(this.pedigreeFile);
        } else {
            pedigree = Pedigree.newParser().parse(header);
        }
        final Pattern tab = Pattern.compile("[\t]");
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(the_dictionary).logger(LOG);
        while (lineiter.hasNext()) {
            String line = lineiter.next();
            final VariantContext ctx = progress.watch(this.the_codec.decode(line));
            if (!ctx.isVariant())
                continue;
            if (ignore_filtered && ctx.isFiltered())
                continue;
            // simplify line
            final String[] tokens = tab.split(line);
            // ID
            tokens[2] = VCFConstants.EMPTY_ID_FIELD;
            // QUAL
            tokens[5] = VCFConstants.MISSING_VALUE_v4;
            // FILTER
            tokens[6] = VCFConstants.UNFILTERED;
            // INFO
            tokens[7] = VCFConstants.EMPTY_INFO_FIELD;
            line = String.join(VCFConstants.FIELD_SEPARATOR, Arrays.asList(tokens));
            for (final GeneName g : getGenes(vcfTools, ctx)) {
                if (regexType != null && regexType.matcher(g.type).matches())
                    continue;
                final Call c = new Call();
                c.line = line;
                c.gene = g;
                sortingCollection.add(c);
            }
        }
        CloserUtil.close(lineiter);
        lineiter = null;
        sortingCollection.doneAdding();
        /**
         * dump
         */
        final Set<String> casesSamples = pedigree.getPersons().stream().filter(P -> P.isAffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> controlsSamples = pedigree.getPersons().stream().filter(P -> P.isUnaffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> maleSamples = pedigree.getPersons().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> femaleSamples = pedigree.getPersons().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Predicate<Genotype> genotypeFilter = genotype -> {
            if (!genotype.isAvailable())
                return false;
            if (!genotype.isCalled())
                return false;
            if (genotype.isNoCall())
                return false;
            if (genotype.isHomRef())
                return false;
            if (this.ignore_filtered_genotype && genotype.isFiltered())
                return false;
            return true;
        };
        PrintStream pw = openFileOrStdoutAsPrintStream(this.outFile);
        pw.print("#chrom");
        pw.print('\t');
        pw.print("min.POS");
        pw.print('\t');
        pw.print("max.POS");
        pw.print('\t');
        pw.print("gene.name");
        pw.print('\t');
        pw.print("gene.type");
        pw.print('\t');
        pw.print("samples.affected");
        pw.print('\t');
        pw.print("count.variations");
        if (!casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.cases");
        }
        if (!controlsSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.controls");
        }
        if (!maleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.males");
        }
        if (!femaleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.females");
        }
        if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("fisher");
        }
        for (final String sample : sampleNames) {
            pw.print('\t');
            pw.print(sample);
        }
        pw.println();
        final CloseableIterator<Call> iter = sortingCollection.iterator();
        final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
        while (eqiter.hasNext()) {
            final List<Call> row = eqiter.next();
            final Call first = row.get(0);
            final List<VariantContext> variantList = row.stream().map(R -> GroupByGene.this.the_codec.decode(R.line)).collect(Collectors.toList());
            final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
            final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
            final Set<String> sampleCarryingMut = new HashSet<String>();
            final Counter<String> pedCasesCarryingMut = new Counter<String>();
            final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
            final Counter<String> malesCarryingMut = new Counter<String>();
            final Counter<String> femalesCarryingMut = new Counter<String>();
            final Counter<String> sample2count = new Counter<String>();
            for (final VariantContext ctx : variantList) {
                for (final Genotype genotype : ctx.getGenotypes()) {
                    if (!genotypeFilter.test(genotype))
                        continue;
                    final String sampleName = genotype.getSampleName();
                    sample2count.incr(sampleName);
                    sampleCarryingMut.add(sampleName);
                    if (casesSamples.contains(sampleName)) {
                        pedCasesCarryingMut.incr(sampleName);
                    }
                    if (controlsSamples.contains(sampleName)) {
                        pedCtrlsCarryingMut.incr(sampleName);
                    }
                    if (maleSamples.contains(sampleName)) {
                        malesCarryingMut.incr(sampleName);
                    }
                    if (femaleSamples.contains(sampleName)) {
                        femalesCarryingMut.incr(sampleName);
                    }
                }
            }
            pw.print(first.getContig());
            pw.print('\t');
            // convert to bed
            pw.print(minPos - 1);
            pw.print('\t');
            pw.print(maxPos);
            pw.print('\t');
            pw.print(first.gene.name);
            pw.print('\t');
            pw.print(first.gene.type);
            pw.print('\t');
            pw.print(sampleCarryingMut.size());
            pw.print('\t');
            pw.print(variantList.size());
            if (!casesSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCasesCarryingMut.getCountCategories());
            }
            if (!controlsSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCtrlsCarryingMut.getCountCategories());
            }
            if (!maleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(malesCarryingMut.getCountCategories());
            }
            if (!femaleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(femalesCarryingMut.getCountCategories());
            }
            if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
                int count_case_mut = 0;
                int count_ctrl_mut = 0;
                int count_case_wild = 0;
                int count_ctrl_wild = 0;
                for (final VariantContext ctx : variantList) {
                    for (final Genotype genotype : ctx.getGenotypes()) {
                        final String sampleName = genotype.getSampleName();
                        final boolean has_mutation = genotypeFilter.test(genotype);
                        if (controlsSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_ctrl_mut++;
                            } else {
                                count_ctrl_wild++;
                            }
                        } else if (casesSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_case_mut++;
                            } else {
                                count_case_wild++;
                            }
                        }
                    }
                }
                final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
                pw.print('\t');
                pw.print(fisher.getAsDouble());
            }
            for (final String sample : sampleNames) {
                pw.print('\t');
                pw.print(sample2count.count(sample));
            }
            pw.println();
            if (pw.checkError())
                break;
        }
        eqiter.close();
        iter.close();
        pw.flush();
        if (this.outFile != null)
            pw.close();
    } finally {
        CloserUtil.close(lineiter);
        if (sortingCollection != null)
            sortingCollection.cleanup();
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) Arrays(java.util.Arrays) LineIterator(htsjdk.tribble.readers.LineIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Function(java.util.function.Function) ParametersDelegate(com.beust.jcommander.ParametersDelegate) HashSet(java.util.HashSet) DataOutputStream(java.io.DataOutputStream) StringUtil(htsjdk.samtools.util.StringUtil) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VCFConstants(htsjdk.variant.vcf.VCFConstants) CloserUtil(htsjdk.samtools.util.CloserUtil) SnpEffPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser) PrintStream(java.io.PrintStream) SortingCollection(htsjdk.samtools.util.SortingCollection) Counter(com.github.lindenb.jvarkit.util.Counter) AbstractVCFCodec(htsjdk.variant.vcf.AbstractVCFCodec) Predicate(java.util.function.Predicate) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) List(java.util.List) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContext(htsjdk.variant.variantcontext.VariantContext) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) Collections(java.util.Collections) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) VariantContext(htsjdk.variant.variantcontext.VariantContext) LineIterator(htsjdk.tribble.readers.LineIterator) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) Counter(com.github.lindenb.jvarkit.util.Counter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Pattern(java.util.regex.Pattern) PrintStream(java.io.PrintStream) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) Pedigree(com.github.lindenb.jvarkit.util.Pedigree)

Aggregations

Parameter (com.beust.jcommander.Parameter)1 ParametersDelegate (com.beust.jcommander.ParametersDelegate)1 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)1 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)1 FisherExactTest (com.github.lindenb.jvarkit.math.stats.FisherExactTest)1 Counter (com.github.lindenb.jvarkit.util.Counter)1 Pedigree (com.github.lindenb.jvarkit.util.Pedigree)1 EqualRangeIterator (com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator)1 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)1 Program (com.github.lindenb.jvarkit.util.jcommander.Program)1 Logger (com.github.lindenb.jvarkit.util.log.Logger)1 AbstractDataCodec (com.github.lindenb.jvarkit.util.picard.AbstractDataCodec)1 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)1 VCFUtils (com.github.lindenb.jvarkit.util.vcf.VCFUtils)1 VcfTools (com.github.lindenb.jvarkit.util.vcf.VcfTools)1 AnnPredictionParser (com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser)1 SnpEffPredictionParser (com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser)1 VepPredictionParser (com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser)1 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)1 CloseableIterator (htsjdk.samtools.util.CloseableIterator)1