
Example 1 with Pedigree

Use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.

From class VcfEpistatis01, method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.number_of_jobs < 1) {
        LOG.error("bad number of jobs");
        return -1;
    }
    try {
        final int variantsCount;
        final List<VariantContext> inMemoryVariants;
        final File vcfFile = new File(oneAndOnlyOneFile(args));
        final File tmpIndexFile;
        if (vcfFile.equals(this.outputFile)) {
            LOG.error("input == output");
            return -1;
        }
        VCFFileReader vcfFileReader = new VCFFileReader(vcfFile, false);
        final VCFHeader header = vcfFileReader.getFileHeader();
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
        } else {
            pedigree = new Pedigree.Parser().parse(header);
        }
        pedigree.verifyPersonsHaveUniqueNames();
        final Map<String, Integer> sample2index = header.getSampleNameToOffset();
        final int[] caseIndexes = pedigree.getAffected().stream().filter(P -> sample2index.containsKey(P.getId())).mapToInt(P -> sample2index.get(P.getId())).sorted().toArray();
        final int[] ctrlIndexes = new ArrayList<>(pedigree.getUnaffected()).stream().filter(P -> sample2index.containsKey(P.getId())).mapToInt(P -> sample2index.get(P.getId())).sorted().toArray();
        if (caseIndexes.length == 0 || ctrlIndexes.length == 0) {
            LOG.error("empty ped or no case/ctrl");
            vcfFileReader.close();
            return -1;
        }
        if (this.load_variants_in_memory) {
            LOG.info("loading variants in memory");
            tmpIndexFile = null;
            final CloseableIterator<VariantContext> iter2 = vcfFileReader.iterator();
            // should fix https://github.com/samtools/htsjdk/issues/1026 : keep only variants with at least one called genotype
            inMemoryVariants = Collections.unmodifiableList(iter2.stream().filter(this.variantFilter).filter(V -> V.getGenotypes().stream().anyMatch(Genotype::isCalled)).collect(Collectors.toList()));
            variantsCount = inMemoryVariants.size();
            iter2.close();
        } else {
            tmpIndexFile = File.createTempFile("epistatsis", VcfOffsetsIndexFactory.INDEX_EXTENSION);
            tmpIndexFile.deleteOnExit();
            new VcfOffsetsIndexFactory().setLogger(LOG).setPredicate(variantFilter).indexVcfFile(vcfFile, tmpIndexFile);
            final VcfList tmpList = VcfList.fromFile(vcfFile, tmpIndexFile);
            variantsCount = tmpList.size();
            tmpList.close();
            inMemoryVariants = null;
        }
        vcfFileReader.close();
        LOG.info("Number of variants: " + variantsCount);
        Result bestResult = null;
        int x = this.start_index_at;
        while (x + 1 < variantsCount) {
            final List<Runner> runners = new Vector<>(this.number_of_jobs);
            while (x + 1 < variantsCount && runners.size() < this.number_of_jobs) {
                LOG.info("starting " + x + "/" + variantsCount);
                runners.add(new Runner(inMemoryVariants == null ? VcfList.fromFile(vcfFile, tmpIndexFile) : new Vector<>(inMemoryVariants), x, caseIndexes, ctrlIndexes));
                ++x;
            }
            final ExecutorService execSvc;
            if (this.number_of_jobs == 1) {
                execSvc = null;
            } else {
                execSvc = Executors.newFixedThreadPool(this.number_of_jobs);
            }
            if (this.number_of_jobs == 1) {
                runners.get(0).call();
            } else {
                execSvc.invokeAll(runners);
            }
            if (execSvc != null) {
                execSvc.shutdown();
                execSvc.awaitTermination(10000L, TimeUnit.DAYS);
                execSvc.shutdownNow();
            }
            runners.stream().mapToLong(R -> R.duration).min().ifPresent(D -> {
                LOG.info("That took " + (D / 1000f) + " seconds.");
            });
            for (final Runner r : runners) {
                final Result rez = r.result;
                if (rez == null)
                    continue;
                if (bestResult == null || bestResult.score < rez.score) {
                    bestResult = rez;
                    if (this.output_score) {
                        final PrintWriter pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
                        pw.println(bestResult.score + "\t" + bestResult.toString());
                        pw.flush();
                        pw.close();
                    } else {
                        final VariantContextWriter w = openVariantContextWriter(this.outputFile);
                        final VCFHeader header2 = new VCFHeader(header);
                        header2.addMetaDataLine(new VCFHeaderLine(VcfEpistatis01.class.getName(), bestResult.toString()));
                        w.writeHeader(header2);
                        w.add(bestResult.ctx1);
                        w.add(bestResult.ctx2);
                        w.close();
                    }
                }
            }
            LOG.info("best: " + bestResult);
        }
        if (tmpIndexFile != null)
            tmpIndexFile.delete();
        return 0;
    } catch (final Exception err) {
        err.printStackTrace();
        LOG.error(err);
        return -1;
    }
}
Also used: Genotype (htsjdk.variant.variantcontext.Genotype), VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine), CloseableIterator (htsjdk.samtools.util.CloseableIterator), Allele (htsjdk.variant.variantcontext.Allele), JexlVariantPredicate (com.github.lindenb.jvarkit.util.vcf.JexlVariantPredicate), Parameter (com.beust.jcommander.Parameter), VCFFileReader (htsjdk.variant.vcf.VCFFileReader), VCFHeader (htsjdk.variant.vcf.VCFHeader), Callable (java.util.concurrent.Callable), Function (java.util.function.Function), ArrayList (java.util.ArrayList), Vector (java.util.Vector), Pedigree (com.github.lindenb.jvarkit.util.Pedigree), Map (java.util.Map), Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher), ExecutorService (java.util.concurrent.ExecutorService), CloserUtil (htsjdk.samtools.util.CloserUtil), PrintWriter (java.io.PrintWriter), Predicate (java.util.function.Predicate), Logger (com.github.lindenb.jvarkit.util.log.Logger), Collectors (java.util.stream.Collectors), File (java.io.File), Executors (java.util.concurrent.Executors), VcfOffsetsIndexFactory (com.github.lindenb.jvarkit.tools.vcflist.VcfOffsetsIndexFactory), TimeUnit (java.util.concurrent.TimeUnit), List (java.util.List), VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter), VariantContext (htsjdk.variant.variantcontext.VariantContext), Collections (java.util.Collections), VcfList (com.github.lindenb.jvarkit.tools.vcflist.VcfList)
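
The Pedigree-specific pattern in this example is: parse the pedigree either from a PED file or from the VCF header, check that person names are unique, then map affected and unaffected persons to their sample column indexes in the VCF. Below is a minimal, self-contained sketch of that pattern; it reuses only the calls shown in the example above, while the class name, helper name and the "throws Exception" signature are illustrative assumptions rather than jvarkit API.

import java.io.File;
import java.util.Map;
import com.github.lindenb.jvarkit.util.Pedigree;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;

public class PedigreeIndexesSketch {
    // hypothetical helper: returns {caseIndexes, controlIndexes} for a VCF and an optional PED file
    static int[][] caseControlIndexes(final File vcfFile, final File pedFileOrNull) throws Exception {
        try (VCFFileReader reader = new VCFFileReader(vcfFile, false)) {
            final VCFHeader header = reader.getFileHeader();
            // parse from the PED file if provided, otherwise from the VCF header
            final Pedigree pedigree = pedFileOrNull != null
                    ? new Pedigree.Parser().parse(pedFileOrNull)
                    : new Pedigree.Parser().parse(header);
            pedigree.verifyPersonsHaveUniqueNames();
            // sample name -> genotype column offset in the VCF
            final Map<String, Integer> sample2index = header.getSampleNameToOffset();
            final int[] caseIndexes = pedigree.getAffected().stream()
                    .filter(P -> sample2index.containsKey(P.getId()))
                    .mapToInt(P -> sample2index.get(P.getId()))
                    .sorted().toArray();
            final int[] ctrlIndexes = pedigree.getUnaffected().stream()
                    .filter(P -> sample2index.containsKey(P.getId()))
                    .mapToInt(P -> sample2index.get(P.getId()))
                    .sorted().toArray();
            return new int[][] { caseIndexes, ctrlIndexes };
        }
    }
}

As in the example, an empty case or control array should be treated as an error before any downstream computation.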

Example 2 with Pedigree

Use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.

From class VcfGeneEpistasis, method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.geneBed == null) {
        LOG.error("gene file bed undefined");
        return -1;
    }
    if (this.outputFile == null) {
        LOG.error("output file undefined");
        return -1;
    }
    CloseableIterator<VariantContext> iter = null;
    try {
        final File vcfFile = new File(oneAndOnlyOneFile(args));
        this.vcfFileReader = new VCFFileReader(vcfFile, true);
        final VCFHeader header = this.vcfFileReader.getFileHeader();
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
        } else {
            pedigree = new Pedigree.Parser().parse(header);
        }
        if (pedigree == null || pedigree.isEmpty() || !pedigree.hasAffected() || !pedigree.hasUnaffected()) {
            LOG.error("empty ped or no case/ctrl");
            return -1;
        }
        pedigree.verifyPersonsHaveUniqueNames();
        for (final Pedigree.Person p : pedigree.getPersons().stream().filter(P -> P.isAffected() || P.isUnaffected()).filter(P -> header.getSampleNamesInOrder().contains(P.getId())).collect(Collectors.toSet())) {
            this.id2samples.put(p.getId(), p);
        }
        this.vcfTools = new VcfTools(header);
        List<Interval> geneList;
        if (!this.geneBed.exists()) {
            final Map<String, Interval> gene2interval = new HashMap<>(50000);
            LOG.info("building gene file" + this.geneBed);
            iter = this.vcfFileReader.iterator();
            // iter = this.vcfFileReader.query("chr3",1,300_000_000);
            final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
            while (iter.hasNext()) {
                final VariantContext ctx = progress.watch(iter.next());
                if (!accept(ctx))
                    continue;
                for (final String geneName : getGenes(ctx)) {
                    final Interval old = gene2interval.get(geneName);
                    if (old == null) {
                        gene2interval.put(geneName, new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, geneName));
                        LOG.info("adding " + geneName + ". number of genes: " + gene2interval.size());
                    } else if (!old.getContig().equals(ctx.getContig())) {
                        LOG.error("boum :" + geneName + ": on chrom " + ctx.getContig() + " vs " + old);
                        return -1;
                    } else {
                        gene2interval.put(geneName, new Interval(ctx.getContig(), Math.min(ctx.getStart(), old.getStart()), Math.max(ctx.getEnd(), old.getEnd()), false, geneName));
                    }
                }
            }
            iter.close();
            iter = null;
            progress.finish();
            geneList = new ArrayList<>(gene2interval.values());
            PrintWriter pw = new PrintWriter(this.geneBed);
            for (final Interval g : geneList) {
                pw.println(g.getContig() + "\t" + (g.getStart() - 1) + "\t" + (g.getEnd()) + "\t" + g.getName());
            }
            pw.flush();
            pw.close();
            pw = null;
        } else {
            BedLineCodec codec = new BedLineCodec();
            BufferedReader r = IOUtil.openFileForBufferedReading(geneBed);
            geneList = r.lines().map(L -> codec.decode(L)).filter(B -> B != null).map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd(), true, B.get(3))).collect(Collectors.toList());
            r.close();
        }
        if (geneList.isEmpty()) {
            LOG.error("gene List is empty");
            return -1;
        }
        final Comparator<VariantContext> ctxSorter = VCFUtils.createTidPosRefComparator(header.getSequenceDictionary());
        final Function<Interval, List<VariantContext>> loadVariants = (R) -> {
            List<VariantContext> L = new ArrayList<>();
            CloseableIterator<VariantContext> r = this.vcfFileReader.query(R.getContig(), R.getStart(), R.getEnd());
            while (r.hasNext()) {
                final VariantContext ctx = r.next();
                if (!accept(ctx))
                    continue;
                if (!getGenes(ctx).contains(R.getName()))
                    continue;
                L.add(ctx);
            }
            r.close();
            return L;
        };
        final SkatExecutor executor = this.skatFactory.build();
        Double bestSkat = null;
        LOG.info("number of genes : " + geneList.size());
        final int list_end_index = (this.user_end_index < 0 ? geneList.size() : Math.min(geneList.size(), this.user_end_index));
        for (int x = this.user_begin_index; x < list_end_index; ++x) {
            final Interval intervalx = geneList.get(x);
            final List<VariantContext> variantsx = loadVariants.apply(intervalx);
            if (variantsx.isEmpty())
                continue;
            // note: y runs to geneList.size(), not to list_end_index
            for (int y = x; y < geneList.size(); ++y) {
                final Interval intervaly;
                final List<VariantContext> merge;
                if (y == x) {
                    // x == y : we're testing this gene on its own
                    intervaly = intervalx;
                    merge = variantsx;
                } else {
                    intervaly = geneList.get(y);
                    if (intervaly.intersects(intervalx))
                        continue;
                    final List<VariantContext> variantsy = loadVariants.apply(intervaly);
                    if (variantsy.isEmpty())
                        continue;
                    merge = new MergedList<>(variantsx, variantsy);
                }
                LOG.info("testing : [" + x + "]" + intervalx + " [" + y + "]" + intervaly + " N:" + geneList.size() + " best: " + bestSkat);
                final Double skat = eval(executor, merge);
                if (skat == null)
                    continue;
                if (bestSkat == null || skat.compareTo(bestSkat) < 0) {
                    bestSkat = skat;
                    LOG.info("best " + bestSkat + " " + intervalx + " " + intervaly);
                    if (this.outputFile.getName().endsWith(".vcf") || this.outputFile.getName().endsWith(".vcf.gz")) {
                        final VCFHeader header2 = new VCFHeader(header);
                        header2.addMetaDataLine(new VCFHeaderLine(VcfGeneEpistasis.class.getName(), intervalx.getName() + " " + intervaly.getName() + " " + bestSkat));
                        final VariantContextWriter w = VCFUtils.createVariantContextWriter(outputFile);
                        w.writeHeader(header2);
                        merge.stream().sorted(ctxSorter).forEach(V -> w.add(V));
                        w.close();
                    } else {
                        final PrintWriter w = super.openFileOrStdoutAsPrintWriter(outputFile);
                        w.println(String.valueOf(bestSkat) + "\t" + intervalx.getName() + "\t" + intervaly.getName());
                        w.flush();
                        w.close();
                    }
                }
            }
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(this.vcfFileReader);
    }
}
Also used: VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine), CloseableIterator (htsjdk.samtools.util.CloseableIterator), VCFUtils (com.github.lindenb.jvarkit.util.vcf.VCFUtils), Program (com.github.lindenb.jvarkit.util.jcommander.Program), IOUtil (htsjdk.samtools.util.IOUtil), Parameter (com.beust.jcommander.Parameter), VCFFileReader (htsjdk.variant.vcf.VCFFileReader), VCFHeader (htsjdk.variant.vcf.VCFHeader), BedLineCodec (com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec), SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress), AbstractList (java.util.AbstractList), HashMap (java.util.HashMap), Function (java.util.function.Function), ParametersDelegate (com.beust.jcommander.ParametersDelegate), ArrayList (java.util.ArrayList), Interval (htsjdk.samtools.util.Interval), SkatResult (com.github.lindenb.jvarkit.tools.skat.SkatFactory.SkatResult), StringUtil (htsjdk.samtools.util.StringUtil), SkatExecutor (com.github.lindenb.jvarkit.tools.skat.SkatFactory.SkatExecutor), Pedigree (com.github.lindenb.jvarkit.util.Pedigree), Map (java.util.Map), Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher), CloserUtil (htsjdk.samtools.util.CloserUtil), PrintWriter (java.io.PrintWriter), SkatFactory (com.github.lindenb.jvarkit.tools.skat.SkatFactory), Logger (com.github.lindenb.jvarkit.util.log.Logger), Set (java.util.Set), Collectors (java.util.stream.Collectors), File (java.io.File), List (java.util.List), VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter), VariantContext (htsjdk.variant.variantcontext.VariantContext), VcfTools (com.github.lindenb.jvarkit.util.vcf.VcfTools), BufferedReader (java.io.BufferedReader), Comparator (java.util.Comparator)
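
The pedigree step this example relies on is: require that the pedigree provides both cases and controls, then keep only affected/unaffected persons whose sample name appears in the VCF header. A condensed sketch of that selection, using only the Pedigree and VCFHeader calls visible above (the class and method names are illustrative, not part of jvarkit):

import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
import com.github.lindenb.jvarkit.util.Pedigree;
import htsjdk.variant.vcf.VCFHeader;

public class PedigreeSelectionSketch {
    // hypothetical helper: sample id -> affected/unaffected person present in both the pedigree and the VCF header
    static Map<String, Pedigree.Person> selectCaseControl(final Pedigree pedigree, final VCFHeader header) {
        if (pedigree == null || pedigree.isEmpty() || !pedigree.hasAffected() || !pedigree.hasUnaffected()) {
            throw new IllegalArgumentException("empty pedigree or no case/control");
        }
        pedigree.verifyPersonsHaveUniqueNames();
        final Map<String, Pedigree.Person> id2person = new HashMap<>();
        for (final Pedigree.Person p : pedigree.getPersons().stream()
                .filter(P -> P.isAffected() || P.isUnaffected())
                .filter(P -> header.getSampleNamesInOrder().contains(P.getId()))
                .collect(Collectors.toSet())) {
            id2person.put(p.getId(), p);
        }
        return id2person;
    }
}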

Example 3 with Pedigree

Use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.

From class VcfDoest, method run.

private void run(final LineIterator lr, final PrintWriter pw) throws IOException {
    SortingCollection<TranscriptInfo> sorting = null;
    CloseableIterator<TranscriptInfo> iter2 = null;
    try {
        while (lr.hasNext()) {
            VcfIterator in = VCFUtils.createVcfIteratorFromLineIterator(lr, true);
            final VCFHeader header = in.getHeader();
            final Pedigree pedigree = Pedigree.newParser().parse(header);
            if (pedigree.isEmpty()) {
                throw new IOException("No pedigree found in header VCF header. use VcfInjectPedigree to add it");
            }
            final SortedSet<Pedigree.Person> individuals = new TreeSet<>();
            for (final Pedigree.Person individual : pedigree.getPersons()) {
                if (individual.isAffected() || individual.isUnaffected()) {
                    individuals.add(individual);
                }
            }
            boolean first = true;
            pw.println("# samples ( 0: unaffected 1:affected)");
            pw.print("population <- data.frame(family=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print("\"" + person.getFamily().getId() + "\"");
                first = false;
            }
            pw.print("),name=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print("\"" + person.getId() + "\"");
                first = false;
            }
            pw.print("),status=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print(person.isUnaffected() ? 0 : 1);
                first = false;
            }
            pw.println("))");
            sorting = SortingCollection.newInstance(TranscriptInfo.class, new TranscriptInfoCodec(), new TranscriptInfoCmp(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
            sorting.setDestructiveIteration(true);
            final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
            /**
             * loop over variants
             */
            while (in.hasNext() && !pw.checkError()) {
                final VariantContext ctx = progess.watch(in.next());
                if (ctx.isFiltered())
                    continue;
                if (ctx.getAlternateAlleles().isEmpty())
                    continue;
                final Allele altAllele = ctx.getAltAlleleWithHighestAlleleCount();
                final MafCalculator mafCalculator = new MafCalculator(altAllele, ctx.getContig());
                boolean genotyped = false;
                for (final Pedigree.Person p : pedigree.getPersons()) {
                    if (!(p.isAffected() || p.isUnaffected()))
                        continue;
                    final Genotype g = ctx.getGenotype(p.getId());
                    if (g == null)
                        throw new IOException("Strange I cannot find individual " + p + " in the pedigree. Aborting.");
                    if (g.isCalled()) {
                        mafCalculator.add(g, p.isMale());
                    }
                    if (g.isHet() || g.isHomVar()) {
                        if (!g.getAlleles().contains(altAllele))
                            continue;
                        genotyped = true;
                        break;
                    }
                }
                if (!genotyped)
                    continue;
                final Interval interval = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd());
                final List<KnownGene> genes = this.overlap(interval);
                if (genes.isEmpty())
                    continue;
                for (final KnownGene kg : genes) {
                    final TranscriptInfo trInfo = new TranscriptInfo();
                    trInfo.contig = kg.getContig();
                    trInfo.txStart = kg.getTxStart();
                    trInfo.txEnd = kg.getTxEnd();
                    trInfo.transcriptName = kg.getName();
                    trInfo.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
                    trInfo.exonCount = kg.getExonCount();
                    trInfo.transcriptLength = kg.getTranscriptLength();
                    trInfo.ctxStart = ctx.getStart();
                    trInfo.ref = ctx.getReference();
                    trInfo.alt = altAllele;
                    trInfo.maf = mafCalculator.getMaf();
                    trInfo.genotypes = new byte[individuals.size()];
                    int idx = 0;
                    for (final Pedigree.Person individual : individuals) {
                        final Genotype genotype = ctx.getGenotype(individual.getId());
                        final byte b;
                        if (genotype.isHomRef()) {
                            b = 0;
                        } else if (genotype.isHomVar() && genotype.getAlleles().contains(altAllele)) {
                            b = 2;
                        } else if (genotype.isHet() && genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
                            b = 1;
                        } else if (genotype.isHet() && !genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
                            // we treat 0/2 as hom-ref
                            LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
                            b = 0;
                        } else if (genotype.isHomVar() && !genotype.getAlleles().contains(altAllele)) {
                            // we treat 2/2 as hom-ref
                            LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
                            b = 0;
                        } else {
                            b = -9;
                        }
                        trInfo.genotypes[idx] = b;
                        ++idx;
                    }
                    KnownGene archetype = kg;
                    /* find gene archetype = longest overlapping */
                    for (final KnownGene kg2 : genes) {
                        if (kg2 == kg)
                            continue;
                        if (archetype.getStrand().equals(kg2.getStrand()) && archetype.getTranscriptLength() < kg2.getTranscriptLength()) {
                            archetype = kg2;
                        }
                    }
                    trInfo.archetypeName = archetype.getName();
                    trInfo.archetypeLength = archetype.getTranscriptLength();
                    boolean ctxWasFoundInExon = false;
                    final int ctxPos0 = ctx.getStart() - 1;
                    int indexInTranscript0 = 0;
                    for (final KnownGene.Exon exon : kg.getExons()) {
                        // variant in exon ?
                        if (!(exon.getStart() > (ctx.getEnd() - 1) || (ctx.getStart() - 1) >= exon.getEnd())) {
                            ctxWasFoundInExon = true;
                            indexInTranscript0 += (ctxPos0 - exon.getStart());
                            if (kg.isNegativeStrand()) {
                                indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
                            }
                            trInfo.indexInTranscript0 = indexInTranscript0;
                            trInfo.overlapName = exon.getName();
                            sorting.add(trInfo);
                            break;
                        } else {
                            indexInTranscript0 += (exon.getEnd() - exon.getStart());
                        }
                    }
                    if (ctxWasFoundInExon) {
                        continue;
                    }
                    indexInTranscript0 = 0;
                    // search closest intron/exon junction
                    for (int ex = 0; ex + 1 < kg.getExonCount(); ++ex) {
                        final KnownGene.Exon exon1 = kg.getExon(ex);
                        indexInTranscript0 += (exon1.getEnd() - exon1.getStart());
                        final KnownGene.Exon exon2 = kg.getExon(ex + 1);
                        if (exon1.getEnd() <= ctxPos0 && ctxPos0 < exon2.getStart()) {
                            final int dist_to_exon1 = ctxPos0 - exon1.getEnd();
                            final int dist_to_exon2 = exon2.getStart() - ctxPos0;
                            if (dist_to_exon2 < dist_to_exon1) {
                                indexInTranscript0++;
                            }
                            if (kg.isNegativeStrand()) {
                                indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
                            }
                            trInfo.indexInTranscript0 = indexInTranscript0;
                            trInfo.overlapName = exon1.getNextIntron().getName();
                            sorting.add(trInfo);
                            break;
                        }
                    }
                }
            // end loop over genes
            }
            // end while loop over variants
            progess.finish();
            sorting.doneAdding();
            LOG.info("done adding");
            iter2 = sorting.iterator();
            final EqualRangeIterator<TranscriptInfo> eqiter = new EqualRangeIterator<TranscriptInfo>(iter2, new Comparator<TranscriptInfo>() {

                @Override
                public int compare(final TranscriptInfo o1, final TranscriptInfo o2) {
                    int i = o1.contig.compareTo(o2.contig);
                    if (i != 0)
                        return i;
                    i = o1.transcriptName.compareTo(o2.transcriptName);
                    return i;
                }
            });
            while (eqiter.hasNext()) {
                final List<TranscriptInfo> list = eqiter.next();
                final TranscriptInfo front = list.get(0);
                pw.println("# BEGIN TRANSCRIPT " + front.transcriptName + " ##########################################");
                pw.println("transcript.chrom <- \"" + front.contig + "\"");
                pw.println("transcript.txStart0 <- " + front.txStart + "");
                pw.println("transcript.txEnd0 <- " + front.txEnd + "");
                pw.println("transcript.name <- \"" + front.transcriptName + "\"");
                pw.println("transcript.strand <- \"" + ((char) front.strand) + "\"");
                pw.println("transcript.length <- " + front.transcriptLength + "");
                pw.println("transcript.exonCount <- " + front.exonCount + "");
                pw.println("archetype.name <- \"" + front.archetypeName + "\"");
                pw.println("archetype.length <- " + front.archetypeLength + "");
                pw.print("variants <- data.frame(chrom=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.contig + "\"");
                    first = false;
                }
                pw.print("),chromStart=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.ctxStart);
                    first = false;
                }
                pw.print("),chromEnd=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.ctxStart + v.ref.length() - 1);
                    first = false;
                }
                pw.print("),refAllele=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.ref.getDisplayString() + "\"");
                    first = false;
                }
                pw.print("),altAllele=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.alt.getDisplayString() + "\"");
                    first = false;
                }
                pw.print("),positionInTranscript1=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.indexInTranscript0 + 1);
                    first = false;
                }
                pw.print("),maf=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.maf);
                    first = false;
                }
                pw.print("),overlapName=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.overlapName + "\"");
                    first = false;
                }
                pw.println("))");
                pw.println("# genotypes as a list. Should be a multiple of length(samples).");
                pw.println("# 0 is homref (0/0), 1 is het (0/1), 2 is homvar (1/1)");
                pw.println("# if the variant contains another ALT allele: (0/2) and (2/2) are considered 0 (homref)");
                pw.print("genotypes <- c(");
                first = true;
                for (final TranscriptInfo tr : list) {
                    for (byte g : tr.genotypes) {
                        if (!first)
                            pw.print(",");
                        first = false;
                        pw.print((int) g);
                    }
                }
                pw.println(")");
                pw.println("stopifnot(NROW(variants) * NROW(population) == length(genotypes) )");
                if (this.userDefinedFunName == null || this.userDefinedFunName.trim().isEmpty()) {
                    pw.println("## WARNING not user-defined R function was defined");
                } else {
                    pw.println("# consumme data with user-defined R function ");
                    pw.println(this.userDefinedFunName + "()");
                }
                pw.println("# END TRANSCRIPT " + front.transcriptName + " ##########################################");
            }
            // end while eqiter
            eqiter.close();
            iter2.close();
            iter2 = null;
            sorting.cleanup();
            sorting = null;
        }
    } finally {
        CloserUtil.close(iter2);
        if (sorting != null)
            sorting.cleanup();
    }
}
Also used: VariantContext (htsjdk.variant.variantcontext.VariantContext), EqualRangeIterator (com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator), VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator), TreeSet (java.util.TreeSet), VCFHeader (htsjdk.variant.vcf.VCFHeader), SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress), Genotype (htsjdk.variant.variantcontext.Genotype), IOException (java.io.IOException), Allele (htsjdk.variant.variantcontext.Allele), Pedigree (com.github.lindenb.jvarkit.util.Pedigree), KnownGene (com.github.lindenb.jvarkit.util.ucsc.KnownGene), Interval (htsjdk.samtools.util.Interval)
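
The pedigree-dependent part of this example is the generated R "population" data.frame: only affected or unaffected persons are kept (in a sorted set), and the status column encodes 0 for unaffected and 1 for affected. A compact sketch of that step using the same Pedigree accessors, with Collectors.joining replacing the first/comma bookkeeping of the original (the class and method names are assumptions):

import java.io.PrintWriter;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.stream.Collectors;
import com.github.lindenb.jvarkit.util.Pedigree;

public class PedigreeToRSketch {
    // hypothetical helper: writes the 'population' data.frame consumed by the generated R script
    static void printPopulation(final Pedigree pedigree, final PrintWriter pw) {
        final SortedSet<Pedigree.Person> individuals = new TreeSet<>();
        for (final Pedigree.Person p : pedigree.getPersons()) {
            if (p.isAffected() || p.isUnaffected()) individuals.add(p);
        }
        pw.println("# samples (0: unaffected, 1: affected)");
        pw.println("population <- data.frame(family=c("
                + individuals.stream().map(P -> "\"" + P.getFamily().getId() + "\"").collect(Collectors.joining(","))
                + "),name=c("
                + individuals.stream().map(P -> "\"" + P.getId() + "\"").collect(Collectors.joining(","))
                + "),status=c("
                + individuals.stream().map(P -> P.isUnaffected() ? "0" : "1").collect(Collectors.joining(","))
                + "))");
    }
}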

Example 4 with Pedigree

Use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.

From class CaseControlPlot, method doWork.

@Override
public int doWork(final List<String> args) {
    ArchiveFactory archiveFactory = null;
    VcfIterator in = null;
    VariantContextWriter teeVariantWriter = null;
    final List<CaseControlExtractor> excractors = new ArrayList<>();
    try {
        in = super.openVcfIterator(oneFileOrNull(args));
        final VCFHeader header = in.getHeader();
        excractors.addAll(parseConfigFile(header));
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = Pedigree.newParser().parse(this.pedigreeFile);
        } else {
            pedigree = Pedigree.newParser().parse(header);
        }
        if (pedigree == null || pedigree.isEmpty()) {
            LOG.error("No pedigree defined , or it is empty");
            return -1;
        }
        final Set<Pedigree.Person> casepersons = pedigree.getPersons().stream().filter(F -> F.isAffected() && header.getSampleNamesInOrder().indexOf(F.getId()) != -1).collect(Collectors.toSet());
        if (casepersons.isEmpty()) {
            LOG.error("No Affected individuals in pedigree/header");
            return -1;
        }
        final Set<Pedigree.Person> controlpersons = pedigree.getPersons().stream().filter(F -> F.isUnaffected() && header.getSampleNamesInOrder().indexOf(F.getId()) != -1).collect(Collectors.toSet());
        if (controlpersons.isEmpty()) {
            LOG.error("No Unaffected individuals in pedigree/header");
            return -1;
        }
        if (this.teeToStdout) {
            teeVariantWriter = super.openVariantContextWriter(null);
            teeVariantWriter.writeHeader(header);
        }
        archiveFactory = ArchiveFactory.open(this.outputDirOrZip);
        for (final CaseControlExtractor extractor : excractors) {
            extractor.pw = archiveFactory.openWriter(extractor.name);
        }
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        while (in.hasNext()) {
            final VariantContext ctx = progress.watch(in.next());
            if (teeVariantWriter != null)
                teeVariantWriter.add(ctx);
            for (final CaseControlExtractor handler : excractors) {
                handler.visit(ctx, casepersons, controlpersons);
            }
        }
        for (final CaseControlExtractor handler : excractors) {
            handler.close();
        }
        progress.finish();
        if (teeVariantWriter != null) {
            teeVariantWriter.close();
            teeVariantWriter = null;
        }
        in.close();
        in = null;
        archiveFactory.close();
        archiveFactory = null;
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(in);
        CloserUtil.close(archiveFactory);
        CloserUtil.close(teeVariantWriter);
        for (final CaseControlExtractor handler : excractors) {
            handler.close();
        }
    }
}
Also used: Allele (htsjdk.variant.variantcontext.Allele), Program (com.github.lindenb.jvarkit.util.jcommander.Program), Parameter (com.beust.jcommander.Parameter), VCFHeader (htsjdk.variant.vcf.VCFHeader), SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), StringUtil (htsjdk.samtools.util.StringUtil), Document (org.w3c.dom.Document), Pedigree (com.github.lindenb.jvarkit.util.Pedigree), Map (java.util.Map), Node (org.w3c.dom.Node), Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher), ScriptException (javax.script.ScriptException), CloserUtil (htsjdk.samtools.util.CloserUtil), PrintWriter (java.io.PrintWriter), Predicate (java.util.function.Predicate), Logger (com.github.lindenb.jvarkit.util.log.Logger), VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator), Set (java.util.Set), Collectors (java.util.stream.Collectors), JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException), File (java.io.File), SimpleBindings (javax.script.SimpleBindings), List (java.util.List), Element (org.w3c.dom.Element), VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter), VcfTools (com.github.lindenb.jvarkit.util.vcf.VcfTools), VariantContext (htsjdk.variant.variantcontext.VariantContext), DocumentBuilderFactory (javax.xml.parsers.DocumentBuilderFactory), ArchiveFactory (com.github.lindenb.jvarkit.io.ArchiveFactory)
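
Here the pedigree is split into two Set&lt;Pedigree.Person&gt; collections, cases and controls, each restricted to samples actually present in the VCF header; both sets must be non-empty before variants are processed. A minimal sketch of that split (the class and method names are illustrative):

import java.util.Set;
import java.util.stream.Collectors;
import com.github.lindenb.jvarkit.util.Pedigree;
import htsjdk.variant.vcf.VCFHeader;

public class CaseControlSetsSketch {
    // hypothetical helper: affected (or unaffected) persons whose sample exists in the VCF header
    static Set<Pedigree.Person> personsWithStatus(final Pedigree pedigree, final VCFHeader header, final boolean affected) {
        return pedigree.getPersons().stream()
                .filter(P -> affected ? P.isAffected() : P.isUnaffected())
                .filter(P -> header.getSampleNamesInOrder().indexOf(P.getId()) != -1)
                .collect(Collectors.toSet());
    }
}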

Example 5 with Pedigree

Use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.

From class VcfOptimizePedForSkat, method exec.

private void exec(final long generation, final VCFHeader header, final List<VariantContext> variants, final List<Pedigree.Person> samples, final SkatFactory.SkatExecutor skatExecutor) {
    final Solution solution = new Solution();
    solution.generation = generation;
    String origin = "random";
    if (generation != 0) {
        int nRemove = 1 + this.random.nextInt(this.nSamplesRemove);
        if (generation == 1 && this.bootstrapSamples != null) {
            origin = "bootstrap";
            for (final String sample : this.bootstrapSamples.split("[; ,]")) {
                if (StringUtil.isBlank(sample))
                    continue;
                if (!samples.stream().anyMatch(S -> S.getId().equals(sample))) {
                    throw new JvarkitException.UserError("Sample " + sample + " not found in effective pedigree.");
                }
                LOG.info("bootstraping with " + sample);
                solution.sampleSet.add(sample);
            }
        } else if (generation % 5 == 0 && !this.bestSolutions.isEmpty()) {
            int sol_index = this.random.nextInt(Math.min(this.max_results_output, this.bestSolutions.size()));
            final List<String> list = new ArrayList<>(this.bestSolutions.get(sol_index).sampleSet);
            if (list.size() > 1 && this.random.nextBoolean()) {
                origin = "best-minus-random";
                list.remove(this.random.nextInt(list.size()));
            } else if (list.size() < nRemove) {
                origin = "best-plus-random";
                list.add(samples.get(this.random.nextInt(samples.size())).getId());
            }
            solution.sampleSet.addAll(list);
        } else if (generation % 7 == 0 && this.bestSolutions.size() > 2) {
            final Set<String> set = new HashSet<>(this.bestSolutions.get(0).sampleSet);
            set.addAll(this.bestSolutions.get(1).sampleSet);
            final List<String> bestsamples = new ArrayList<>(set);
            Collections.shuffle(bestsamples, this.random);
            while (bestsamples.size() > nRemove) {
                bestsamples.remove(0);
            }
            solution.sampleSet.addAll(bestsamples);
            origin = "best0-plus-best1";
        } else {
            while (nRemove > 0) {
                final String sampleId;
                if (generation % 3 == 0L && nRemove % 2 == 0 && this.bestSolutions.size() > 0 && !this.bestSolutions.get(0).sampleSet.isEmpty()) {
                    final List<String> bestsamples = new ArrayList<>(this.bestSolutions.get(0).sampleSet);
                    sampleId = bestsamples.get(this.random.nextInt(bestsamples.size()));
                    origin = "random-plus-best0";
                } else {
                    sampleId = samples.get(this.random.nextInt(samples.size())).getId();
                }
                solution.sampleSet.add(sampleId);
                nRemove--;
            }
        }
    } else {
        origin = "original";
    }
    if (generation > 0 && solution.sampleSet.isEmpty())
        return;
    while (solution.sampleSet.size() > this.nSamplesRemove) {
        LOG.warn("Hum... to many for " + origin);
        final List<String> L = new ArrayList<>(solution.sampleSet);
        while (L.size() > 0 && L.size() > this.nSamplesRemove) L.remove(this.random.nextInt(L.size()));
        solution.sampleSet.clear();
        solution.sampleSet.addAll(L);
    }
    if (this.bestSolutions.contains(solution))
        return;
    solution.origin = origin;
    final List<Pedigree.Person> ped2 = new ArrayList<>(samples);
    ped2.removeIf(I -> solution.sampleSet.contains(I.getId()));
    if (ped2.isEmpty())
        return;
    if (!ped2.stream().anyMatch(P -> P.isAffected()))
        return;
    if (!ped2.stream().anyMatch(P -> P.isUnaffected()))
        return;
    // test MAF and Fisher
    final VCFHeader header2 = new VCFHeader(header.getMetaDataInInputOrder(), header.getSampleNamesInOrder());
    final VcfBurdenFisherH.CtxWriterFactory fisherhFactory = new VcfBurdenFisherH.CtxWriterFactory();
    fisherhFactory.setCaseControlExtractor((H) -> new HashSet<>(ped2));
    fisherhFactory.setIgnoreFiltered(true);
    final VcfBurdenMAF.CtxWriterFactory mafFactory = new VcfBurdenMAF.CtxWriterFactory();
    mafFactory.setCaseControlExtractor((H) -> new HashSet<>(ped2));
    mafFactory.setIgnoreFiltered(true);
    final VariantCollector collector = new VariantCollector();
    VariantContextWriter vcw = mafFactory.open(collector);
    vcw = fisherhFactory.open(vcw);
    vcw.writeHeader(header2);
    for (final VariantContext vc : variants) {
        vcw.add(vc);
    }
    vcw.close();
    CloserUtil.close(fisherhFactory);
    CloserUtil.close(mafFactory);
    // 
    solution.result = skatExecutor.execute(collector.variants, ped2);
    if (solution.result.isError())
        return;
    if (this.bestSolutions.isEmpty() || solution.compareTo(this.bestSolutions.get(this.bestSolutions.size() - 1)) < 0) {
        this.bestSolutions.add(solution);
        if (this.firstScore == null) {
            this.firstScore = solution.result.getPValue();
        }
        Collections.sort(this.bestSolutions);
        final int BUFFER_RESULT = 1000;
        while (this.bestSolutions.size() > BUFFER_RESULT) {
            this.bestSolutions.remove(this.bestSolutions.size() - 1);
        }
        if (this.bestSolutions.indexOf(solution) < this.max_results_output) {
            final StringBuilder sb = new StringBuilder();
            sb.append(">>> ").append(generation).append("\n");
            this.bestSolutions.stream().limit(this.max_results_output).forEach(S -> sb.append(S).append("\n"));
            sb.append("<<< ").append(generation).append("\n");
            if (outputFile == null) {
                stdout().println(sb.toString());
            } else {
                stderr().println(sb.toString());
                IOUtils.cat(sb.toString(), this.outputFile, false);
            }
        }
    }
}
Also used: Program (com.github.lindenb.jvarkit.util.jcommander.Program), Parameter (com.beust.jcommander.Parameter), VCFHeader (htsjdk.variant.vcf.VCFHeader), Random (java.util.Random), TreeSet (java.util.TreeSet), ParametersDelegate (com.beust.jcommander.ParametersDelegate), VcfBurdenFisherH (com.github.lindenb.jvarkit.tools.burden.VcfBurdenFisherH), ArrayList (java.util.ArrayList), HashSet (java.util.HashSet), StringUtil (htsjdk.samtools.util.StringUtil), Pedigree (com.github.lindenb.jvarkit.util.Pedigree), IOUtils (com.github.lindenb.jvarkit.io.IOUtils), Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher), CloserUtil (htsjdk.samtools.util.CloserUtil), Logger (com.github.lindenb.jvarkit.util.log.Logger), VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator), Set (java.util.Set), Collectors (java.util.stream.Collectors), JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException), File (java.io.File), VcfBurdenMAF (com.github.lindenb.jvarkit.tools.burden.VcfBurdenMAF), List (java.util.List), VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter), VariantContext (htsjdk.variant.variantcontext.VariantContext), Collections (java.util.Collections)
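
The pedigree manipulation at the core of this example is: copy the effective list of Pedigree.Person samples, remove the candidate sample ids, and only keep going if the reduced pedigree still contains at least one affected and one unaffected individual. A small sketch of that guard (the method name and the null return convention are assumptions):

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import com.github.lindenb.jvarkit.util.Pedigree;

public class ReducedPedigreeSketch {
    // hypothetical helper: returns the samples minus the removed ids,
    // or null if the reduced set no longer contains both cases and controls
    static List<Pedigree.Person> removeSamples(final List<Pedigree.Person> samples, final Set<String> idsToRemove) {
        final List<Pedigree.Person> reduced = new ArrayList<>(samples);
        reduced.removeIf(P -> idsToRemove.contains(P.getId()));
        if (reduced.isEmpty()) return null;
        if (reduced.stream().noneMatch(Pedigree.Person::isAffected)) return null;
        if (reduced.stream().noneMatch(Pedigree.Person::isUnaffected)) return null;
        return reduced;
    }
}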

Aggregations

Pedigree (com.github.lindenb.jvarkit.util.Pedigree): 13 usages
VariantContext (htsjdk.variant.variantcontext.VariantContext): 12 usages
VCFHeader (htsjdk.variant.vcf.VCFHeader): 11 usages
Parameter (com.beust.jcommander.Parameter): 10 usages
Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher): 10 usages
Logger (com.github.lindenb.jvarkit.util.log.Logger): 10 usages
File (java.io.File): 10 usages
List (java.util.List): 10 usages
Program (com.github.lindenb.jvarkit.util.jcommander.Program): 9 usages
VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator): 9 usages
ArrayList (java.util.ArrayList): 9 usages
Collectors (java.util.stream.Collectors): 9 usages
SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress): 8 usages
CloserUtil (htsjdk.samtools.util.CloserUtil): 8 usages
Set (java.util.Set): 8 usages
JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException): 7 usages
VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter): 7 usages
ParametersDelegate (com.beust.jcommander.ParametersDelegate): 6 usages
StringUtil (htsjdk.samtools.util.StringUtil): 6 usages
Genotype (htsjdk.variant.variantcontext.Genotype): 6 usages