Search in sources :

Example 6 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class HaloplexParasite method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    SamReader samReader = null;
    final List<Mutation> mutations = new ArrayList<>();
    try {
        final Set<File> bamFiles = Files.lines(this.bamList.toPath()).filter(T -> !(T.isEmpty() || T.startsWith("#"))).map(T -> new File(T)).collect(Collectors.toSet());
        final VCFHeader header = new VCFHeader();
        header.setSequenceDictionary(in.getHeader().getSequenceDictionary());
        final VCFFilterHeaderLine filter = new VCFFilterHeaderLine("HALOPLEXPARASITE", "fails Parasite Haloplex Sequence");
        final VCFInfoHeaderLine infoWord = new VCFInfoHeaderLine(filter.getID(), 1, VCFHeaderLineType.String, "Parasite Haloplex Sequence (Word|Count|Fraction)");
        super.addMetaData(header);
        out.writeHeader(header);
        header.addMetaDataLine(filter);
        header.addMetaDataLine(infoWord);
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            final VariantContextBuilder vcb = new VariantContextBuilder(inputName, ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
            if (!(ctx.isIndel() || ctx.isMixed())) {
                out.add(vcb.make());
                continue;
            }
            if (!vcb.getAlleles().stream().filter(A -> !(A.isSymbolic() || A.isNoCall() || A.length() < this.minClipSize)).findAny().isPresent()) {
                out.add(vcb.make());
                continue;
            }
            final Mutation mut = new Mutation(ctx);
            mutations.add(mut);
        }
        final Counter<String> words = new Counter<>();
        for (final File bamFile : bamFiles) {
            LOG.info("Opening " + bamFile);
            samReader = createSamReaderFactory().open(bamFile);
            for (final Mutation mut : mutations) {
                // words seen in that mutation : don't use a Counter
                final Set<String> mutWords = new HashSet<>();
                /* loop over reads overlapping that mutation */
                final SAMRecordIterator sri = samReader.queryOverlapping(mut.contig, mut.start, mut.end);
                while (sri.hasNext()) {
                    final SAMRecord rec = sri.next();
                    if (rec.getReadUnmappedFlag())
                        continue;
                    if (rec.isSecondaryOrSupplementary())
                        continue;
                    if (rec.getDuplicateReadFlag())
                        continue;
                    if (rec.getReadFailsVendorQualityCheckFlag())
                        continue;
                    final Cigar cigar = rec.getCigar();
                    if (cigar.numCigarElements() == 1)
                        continue;
                    final byte[] bases = rec.getReadBases();
                    int refpos = rec.getUnclippedStart();
                    int readpos = 0;
                    /* scan cigar overlapping that mutation */
                    for (final CigarElement ce : cigar) {
                        final CigarOperator op = ce.getOperator();
                        final int ref_end = refpos + (op.consumesReferenceBases() || op.isClipping() ? ce.getLength() : 0);
                        final int read_end = readpos + (op.consumesReadBases() ? ce.getLength() : 0);
                        /* check clip is large enough */
                        if (op.equals(CigarOperator.S) && ce.getLength() >= this.minClipSize) {
                            /* check clip overlap mutation */
                            if (!(ref_end < mut.start || refpos > mut.end)) {
                                /* break read of soft clip into words */
                                for (int x = 0; x + this.minClipSize <= ce.getLength(); ++x) {
                                    final String substr = new String(bases, readpos + x, this.minClipSize);
                                    if (!substr.contains("N")) {
                                        final String revcomp = AcidNucleics.reverseComplement(substr);
                                        mutWords.add(substr);
                                        if (!revcomp.equals(substr))
                                            mutWords.add(revcomp);
                                    }
                                }
                            }
                        }
                        refpos = ref_end;
                        readpos = read_end;
                    }
                }
                sri.close();
                for (final String w : mutWords) {
                    words.incr(w);
                }
            }
            // end of for(mutations)
            samReader.close();
            samReader = null;
        }
        LOG.info("mutations:" + mutations.size());
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        for (final Mutation mut : mutations) {
            progress.watch(mut.contig, mut.start);
            final VariantContextBuilder vcb = new VariantContextBuilder(inputName, mut.contig, mut.start, mut.end, mut.alleles);
            String worstString = null;
            Double worstFraction = null;
            final double totalWords = words.getTotal();
            for (final Allele a : mut.alleles) {
                if (a.isSymbolic() || a.isNoCall() || a.length() < this.minClipSize)
                    continue;
                for (int x = 0; x + this.minClipSize <= a.length(); ++x) {
                    final String substr = new String(a.getBases(), x, this.minClipSize);
                    final long count = words.count(substr);
                    final double fraction = count / totalWords;
                    if (worstFraction == null || worstFraction < fraction) {
                        worstString = substr + "|" + count + "|" + fraction;
                        worstFraction = fraction;
                    }
                }
            }
            if (worstString != null) {
                vcb.attribute(infoWord.getID(), worstString);
            }
            if (worstFraction != null && worstFraction.doubleValue() >= this.tresholdFraction) {
                vcb.filter(filter.getID());
            }
            out.add(vcb.make());
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(samReader);
    }
}
Also used : Cigar(htsjdk.samtools.Cigar) Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) CigarOperator(htsjdk.samtools.CigarOperator) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) CloserUtil(htsjdk.samtools.util.CloserUtil) Counter(com.github.lindenb.jvarkit.util.Counter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Files(java.nio.file.Files) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) SamReader(htsjdk.samtools.SamReader) Collectors(java.util.stream.Collectors) File(java.io.File) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SAMRecord(htsjdk.samtools.SAMRecord) File(java.io.File)

Example 7 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class VcfDoest method run.

private void run(final LineIterator lr, final PrintWriter pw) throws IOException {
    SortingCollection<TranscriptInfo> sorting = null;
    CloseableIterator<TranscriptInfo> iter2 = null;
    try {
        while (lr.hasNext()) {
            VcfIterator in = VCFUtils.createVcfIteratorFromLineIterator(lr, true);
            final VCFHeader header = in.getHeader();
            final Pedigree pedigree = Pedigree.newParser().parse(header);
            if (pedigree.isEmpty()) {
                throw new IOException("No pedigree found in header VCF header. use VcfInjectPedigree to add it");
            }
            final SortedSet<Pedigree.Person> individuals = new TreeSet<>();
            for (final Pedigree.Person individual : pedigree.getPersons()) {
                if (individual.isAffected() || individual.isUnaffected()) {
                    individuals.add(individual);
                }
            }
            boolean first = true;
            pw.println("# samples ( 0: unaffected 1:affected)");
            pw.print("population <- data.frame(family=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print("\"" + person.getFamily().getId() + "\"");
                first = false;
            }
            pw.print("),name=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print("\"" + person.getId() + "\"");
                first = false;
            }
            pw.print("),status=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print(person.isUnaffected() ? 0 : 1);
                first = false;
            }
            pw.println("))");
            sorting = SortingCollection.newInstance(TranscriptInfo.class, new TranscriptInfoCodec(), new TranscriptInfoCmp(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
            sorting.setDestructiveIteration(true);
            final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
            /**
             * loop over variants
             */
            while (in.hasNext() && !pw.checkError()) {
                final VariantContext ctx = progess.watch(in.next());
                if (ctx.isFiltered())
                    continue;
                if (ctx.getAlternateAlleles().isEmpty())
                    continue;
                final Allele altAllele = ctx.getAltAlleleWithHighestAlleleCount();
                final MafCalculator mafCalculator = new MafCalculator(altAllele, ctx.getContig());
                boolean genotyped = false;
                for (final Pedigree.Person p : pedigree.getPersons()) {
                    if (!(p.isAffected() || p.isUnaffected()))
                        continue;
                    final Genotype g = ctx.getGenotype(p.getId());
                    if (g == null)
                        throw new IOException("Strange I cannot find individual " + p + " in the pedigree. Aborting.");
                    if (g.isCalled()) {
                        mafCalculator.add(g, p.isMale());
                    }
                    if (g.isHet() || g.isHomVar()) {
                        if (!g.getAlleles().contains(altAllele))
                            continue;
                        genotyped = true;
                        break;
                    }
                }
                if (!genotyped)
                    continue;
                final Interval interval = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd());
                final List<KnownGene> genes = this.overlap(interval);
                if (genes.isEmpty())
                    continue;
                for (final KnownGene kg : genes) {
                    final TranscriptInfo trInfo = new TranscriptInfo();
                    trInfo.contig = kg.getContig();
                    trInfo.txStart = kg.getTxStart();
                    trInfo.txEnd = kg.getTxEnd();
                    trInfo.transcriptName = kg.getName();
                    trInfo.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
                    trInfo.exonCount = kg.getExonCount();
                    trInfo.transcriptLength = kg.getTranscriptLength();
                    trInfo.ctxStart = ctx.getStart();
                    trInfo.ref = ctx.getReference();
                    trInfo.alt = altAllele;
                    trInfo.maf = mafCalculator.getMaf();
                    trInfo.genotypes = new byte[individuals.size()];
                    int idx = 0;
                    for (final Pedigree.Person individual : individuals) {
                        final Genotype genotype = ctx.getGenotype(individual.getId());
                        final byte b;
                        if (genotype.isHomRef()) {
                            b = 0;
                        } else if (genotype.isHomVar() && genotype.getAlleles().contains(altAllele)) {
                            b = 2;
                        } else if (genotype.isHet() && genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
                            b = 1;
                        } else /* we treat 0/2 has hom-ref */
                        if (genotype.isHet() && !genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
                            LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
                            b = 0;
                        } else /* we treat 2/2 has hom-ref */
                        if (genotype.isHomVar() && !genotype.getAlleles().contains(altAllele)) {
                            LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
                            b = 0;
                        } else {
                            b = -9;
                        }
                        trInfo.genotypes[idx] = b;
                        ++idx;
                    }
                    KnownGene archetype = kg;
                    /* find gene archetype = longest overlapping */
                    for (final KnownGene kg2 : genes) {
                        if (kg2 == kg)
                            continue;
                        if (archetype.getStrand().equals(kg2.getStrand()) && archetype.getTranscriptLength() < kg2.getTranscriptLength()) {
                            archetype = kg2;
                        }
                    }
                    trInfo.archetypeName = archetype.getName();
                    trInfo.archetypeLength = archetype.getTranscriptLength();
                    boolean ctxWasFoundInExon = false;
                    final int ctxPos0 = ctx.getStart() - 1;
                    int indexInTranscript0 = 0;
                    for (final KnownGene.Exon exon : kg.getExons()) {
                        // variant in exon ?
                        if (!(exon.getStart() > (ctx.getEnd() - 1) || (ctx.getStart() - 1) >= exon.getEnd())) {
                            ctxWasFoundInExon = true;
                            indexInTranscript0 += (ctxPos0 - exon.getStart());
                            if (kg.isNegativeStrand()) {
                                indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
                            }
                            trInfo.indexInTranscript0 = indexInTranscript0;
                            trInfo.overlapName = exon.getName();
                            sorting.add(trInfo);
                            break;
                        } else {
                            indexInTranscript0 += (exon.getEnd() - exon.getStart());
                        }
                    }
                    if (ctxWasFoundInExon) {
                        continue;
                    }
                    indexInTranscript0 = 0;
                    // search closest intron/exon junction
                    for (int ex = 0; ex + 1 < kg.getExonCount(); ++ex) {
                        final KnownGene.Exon exon1 = kg.getExon(ex);
                        indexInTranscript0 += (exon1.getEnd() - exon1.getStart());
                        final KnownGene.Exon exon2 = kg.getExon(ex + 1);
                        if (exon1.getEnd() <= ctxPos0 && ctxPos0 < exon2.getStart()) {
                            final int dist_to_exon1 = ctxPos0 - exon1.getEnd();
                            final int dist_to_exon2 = exon2.getStart() - ctxPos0;
                            if (dist_to_exon2 < dist_to_exon1) {
                                indexInTranscript0++;
                            }
                            if (kg.isNegativeStrand()) {
                                indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
                            }
                            trInfo.indexInTranscript0 = indexInTranscript0;
                            trInfo.overlapName = exon1.getNextIntron().getName();
                            sorting.add(trInfo);
                            break;
                        }
                    }
                }
            // end loop over genes
            }
            // end while loop over variants
            progess.finish();
            sorting.doneAdding();
            LOG.info("done adding");
            iter2 = sorting.iterator();
            final EqualRangeIterator<TranscriptInfo> eqiter = new EqualRangeIterator<TranscriptInfo>(iter2, new Comparator<TranscriptInfo>() {

                @Override
                public int compare(final TranscriptInfo o1, final TranscriptInfo o2) {
                    int i = o1.contig.compareTo(o2.contig);
                    if (i != 0)
                        return i;
                    i = o1.transcriptName.compareTo(o2.transcriptName);
                    return i;
                }
            });
            while (eqiter.hasNext()) {
                final List<TranscriptInfo> list = eqiter.next();
                final TranscriptInfo front = list.get(0);
                pw.println("# BEGIN TRANSCRIPT " + front.transcriptName + " ##########################################");
                pw.println("transcript.chrom <- \"" + front.contig + "\"");
                pw.println("transcript.txStart0 <- " + front.txStart + "");
                pw.println("transcript.txEnd0 <- " + front.txEnd + "");
                pw.println("transcript.name <- \"" + front.transcriptName + "\"");
                pw.println("transcript.strand <- \"" + ((char) front.strand) + "\"");
                pw.println("transcript.length <- " + front.transcriptLength + "");
                pw.println("transcript.exonCount <- " + front.exonCount + "");
                pw.println("archetype.name <- \"" + front.archetypeName + "\"");
                pw.println("archetype.length <- " + front.archetypeLength + "");
                pw.print("variants <- data.frame(chrom=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.contig + "\"");
                    first = false;
                }
                pw.print("),chromStart=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.ctxStart);
                    first = false;
                }
                pw.print("),chromEnd=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.ctxStart + v.ref.length() - 1);
                    first = false;
                }
                pw.print("),refAllele=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.ref.getDisplayString() + "\"");
                    first = false;
                }
                pw.print("),altAllele=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.alt.getDisplayString() + "\"");
                    first = false;
                }
                pw.print("),positionInTranscript1=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.indexInTranscript0 + 1);
                    first = false;
                }
                pw.print("),maf=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.maf);
                    first = false;
                }
                pw.print("),overlapName=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.overlapName + "\"");
                    first = false;
                }
                pw.println("))");
                pw.println("# genotypes as a list. Should be a multiple of length(samples).");
                pw.println("# 0 is homref (0/0), 1 is het (0/1), 2 is homvar (1/1)");
                pw.println("# if the variant contains another ALT allele: (0/2) and (2/2) are considered 0 (homref)");
                pw.print("genotypes <- c(");
                first = true;
                for (final TranscriptInfo tr : list) {
                    for (byte g : tr.genotypes) {
                        if (!first)
                            pw.print(",");
                        first = false;
                        pw.print((int) g);
                    }
                }
                pw.println(")");
                pw.println("stopifnot(NROW(variants) * NROW(population) == length(genotypes) )");
                if (this.userDefinedFunName == null || this.userDefinedFunName.trim().isEmpty()) {
                    pw.println("## WARNING not user-defined R function was defined");
                } else {
                    pw.println("# consumme data with user-defined R function ");
                    pw.println(this.userDefinedFunName + "()");
                }
                pw.println("# END TRANSCRIPT " + front.transcriptName + " ##########################################");
            }
            // end while eqiter
            eqiter.close();
            iter2.close();
            iter2 = null;
            sorting.cleanup();
            sorting = null;
        }
    } finally {
        CloserUtil.close(iter2);
        if (sorting != null)
            sorting.cleanup();
    }
}
Also used : VariantContext(htsjdk.variant.variantcontext.VariantContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) TreeSet(java.util.TreeSet) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) IOException(java.io.IOException) Allele(htsjdk.variant.variantcontext.Allele) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) Interval(htsjdk.samtools.util.Interval)

Example 8 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class CaseControlPlot method doWork.

@Override
public int doWork(final List<String> args) {
    ArchiveFactory archiveFactory = null;
    VcfIterator in = null;
    VariantContextWriter teeVariantWriter = null;
    final List<CaseControlExtractor> excractors = new ArrayList<>();
    try {
        in = super.openVcfIterator(oneFileOrNull(args));
        final VCFHeader header = in.getHeader();
        excractors.addAll(parseConfigFile(header));
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = Pedigree.newParser().parse(this.pedigreeFile);
        } else {
            pedigree = Pedigree.newParser().parse(header);
        }
        if (pedigree == null || pedigree.isEmpty()) {
            LOG.error("No pedigree defined , or it is empty");
            return -1;
        }
        final Set<Pedigree.Person> casepersons = pedigree.getPersons().stream().filter(F -> F.isAffected() && header.getSampleNamesInOrder().indexOf(F.getId()) != -1).collect(Collectors.toSet());
        if (casepersons.isEmpty()) {
            LOG.error("No Affected individuals in pedigree/header");
            return -1;
        }
        final Set<Pedigree.Person> controlpersons = pedigree.getPersons().stream().filter(F -> F.isUnaffected() && header.getSampleNamesInOrder().indexOf(F.getId()) != -1).collect(Collectors.toSet());
        if (controlpersons.isEmpty()) {
            LOG.error("No Unaffected individuals in pedigree/header");
            return -1;
        }
        if (this.teeToStdout) {
            teeVariantWriter = super.openVariantContextWriter(null);
            teeVariantWriter.writeHeader(header);
        }
        archiveFactory = ArchiveFactory.open(this.outputDirOrZip);
        for (final CaseControlExtractor extractor : excractors) {
            extractor.pw = archiveFactory.openWriter(extractor.name);
        }
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        while (in.hasNext()) {
            final VariantContext ctx = progress.watch(in.next());
            if (teeVariantWriter != null)
                teeVariantWriter.add(ctx);
            for (final CaseControlExtractor handler : excractors) {
                handler.visit(ctx, casepersons, controlpersons);
            }
        }
        for (final CaseControlExtractor handler : excractors) {
            handler.close();
        }
        progress.finish();
        if (teeVariantWriter != null) {
            teeVariantWriter.close();
            teeVariantWriter = null;
        }
        in.close();
        in = null;
        archiveFactory.close();
        archiveFactory = null;
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(in);
        CloserUtil.close(archiveFactory);
        CloserUtil.close(teeVariantWriter);
        for (final CaseControlExtractor handler : excractors) {
            handler.close();
        }
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StringUtil(htsjdk.samtools.util.StringUtil) Document(org.w3c.dom.Document) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) Map(java.util.Map) Node(org.w3c.dom.Node) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) ScriptException(javax.script.ScriptException) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) SimpleBindings(javax.script.SimpleBindings) List(java.util.List) Element(org.w3c.dom.Element) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) VariantContext(htsjdk.variant.variantcontext.VariantContext) DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) ScriptException(javax.script.ScriptException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader)

Example 9 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class VcfBurdenFilterGenes method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
    final VCFHeader header = in.getHeader();
    try {
        final VCFHeader h2 = addMetaData(new VCFHeader(header));
        final VCFFilterHeaderLine filterControlsHeader;
        if (!StringUtil.isBlank(this.filterTag)) {
            filterControlsHeader = new VCFFilterHeaderLine(this.filterTag.trim(), "Genes not in list " + this.geneFile);
            h2.addMetaDataLine(filterControlsHeader);
        } else {
            filterControlsHeader = null;
        }
        final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
        final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
        final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary()).logger(LOG);
        out.writeHeader(h2);
        while (in.hasNext() && !out.checkError()) {
            final VariantContext ctx = progess.watch(in.next());
            boolean keep = false;
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            // not just set FILTER ?
            if (filterControlsHeader == null) {
                vcb.rmAttribute(vepParser.getTag());
                vcb.rmAttribute(annParser.getTag());
            }
            final List<String> newVepList = new ArrayList<>();
            for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
                for (final String col : lookColumns) {
                    final String token = pred.getByCol(col);
                    if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
                        newVepList.add(predStr);
                        keep = true;
                        break;
                    }
                }
            }
            final List<String> newEffList = new ArrayList<>();
            for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
                final String token = pred.getGeneName();
                if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
                    newEffList.add(predStr);
                    keep = true;
                    break;
                }
            }
            // not just set FILTER ?
            if (filterControlsHeader == null) {
                if (!newVepList.isEmpty())
                    vcb.attribute(vepParser.getTag(), newVepList);
                if (!newEffList.isEmpty())
                    vcb.attribute(annParser.getTag(), newEffList);
            }
            if (filterControlsHeader != null) {
                if (!keep) {
                    vcb.filter(filterControlsHeader.getID());
                } else if (!ctx.isFiltered()) {
                    vcb.passFilters();
                }
                out.add(vcb.make());
            } else {
                if (keep)
                    out.add(vcb.make());
            }
        }
        progess.finish();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(in);
    }
}
Also used : AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) AnnPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) StringUtil(htsjdk.samtools.util.StringUtil) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) Files(java.nio.file.Files) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) VariantContext(htsjdk.variant.variantcontext.VariantContext) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) AnnPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 10 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class VcfToSql method read.

private void read(File filename) throws IOException {
    /* insert ATGC */
    this.alleleTable.insert(outputWriter, null, "A");
    this.alleleTable.insert(outputWriter, null, "C");
    this.alleleTable.insert(outputWriter, null, "G");
    this.alleleTable.insert(outputWriter, null, "T");
    /* insert this sample */
    this.vcfFileTable.insert(outputWriter, null, filename);
    final SelectStmt vcffile_id = new SelectStmt(this.vcfFileTable);
    final Map<String, SelectStmt> sample2sampleid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> filter2filterid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> chrom2chromId = new HashMap<String, SelectStmt>();
    final VcfIterator r = VCFUtils.createVcfIteratorFromFile(filename);
    final VCFHeader header = r.getHeader();
    /* parse samples */
    for (final String sampleName : header.getSampleNamesInOrder()) {
        this.sampleTable.insert(outputWriter, null, sampleName);
        SelectStmt sample_id = new SelectStmt(this.sampleTable, "name", sampleName);
        sample2sampleid.put(sampleName, sample_id);
        this.sample2fileTable.insert(outputWriter, null, vcffile_id, sample_id);
    }
    /* parse filters */
    for (final VCFFilterHeaderLine filter : header.getFilterLines()) {
        this.filterTable.insert(outputWriter, null, vcffile_id, filter.getID(), filter.getValue());
        filter2filterid.put(filter.getID(), new SelectStmt(this.filterTable, "name", filter.getID()));
    }
    filter2filterid.put(VCFConstants.PASSES_FILTERS_v4, new SelectStmt(this.filterTable, "name", VCFConstants.PASSES_FILTERS_v4));
    final SAMSequenceDictionary dict = header.getSequenceDictionary();
    if (dict == null) {
        throw new RuntimeException("dictionary missing in VCF");
    }
    /* parse sequence dict */
    for (final SAMSequenceRecord ssr : dict.getSequences()) {
        this.chromosomeTable.insert(outputWriter, null, vcffile_id, ssr.getSequenceName(), ssr.getSequenceLength());
        chrom2chromId.put(ssr.getSequenceName(), new SelectStmt(this.chromosomeTable, "name", ssr.getSequenceName()));
    }
    VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
    SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
    int nVariants = 0;
    while (r.hasNext()) {
        if (this.outputWriter.checkError())
            break;
        VariantContext var = progress.watch(r.next());
        ++nVariants;
        /* insert ref allele */
        this.alleleTable.insert(outputWriter, null, var.getReference().getBaseString());
        /* insert variant */
        this.variantTable.insert(outputWriter, null, vcffile_id, nVariants, chrom2chromId.get(var.getContig()), var.getStart(), (var.hasID() ? var.getID() : null), new SelectStmt(this.alleleTable, "bases", var.getReference().getBaseString()), (var.hasLog10PError() ? var.getPhredScaledQual() : null));
        SelectStmt variant_id = new SelectStmt(variantTable);
        /* insert alternate alleles */
        for (Allele alt : var.getAlternateAlleles()) {
            /* insert alt allele */
            this.alleleTable.insert(outputWriter, null, alt.getBaseString());
            this.variant2altTable.insert(outputWriter, null, variant_id, new SelectStmt(this.alleleTable, "bases", alt.getBaseString()));
        }
        /* insert filters */
        for (final String filter : var.getFilters()) {
            if (filter2filterid.get(filter) == null) {
                throw new IOException("VCF Error: filter " + filter + " is not defined in the VCF header.");
            }
            this.variant2filters.insert(outputWriter, null, variant_id, filter2filterid.get(filter));
        }
        if (!this.ignore_info) {
            for (final VepPrediction pred : vepPredictionParser.getPredictions(var)) {
            /*
					vepPrediction.insert(
							outputWriter,
							null,
							variant_id,
							pred.getEnsemblGene(),
							pred.getEnsemblTranscript(),
							pred.getEnsemblProtein(),
							pred.getSymbol()
							);
					SelectStmt pred_id = new SelectStmt(vepPrediction);
			
					for(SequenceOntologyTree.Term t: pred.getSOTerms())
						{
						String term=t.getAcn().replace(':', '_');
						soTermTable.insert(
								outputWriter,
								null,
								term,
								t.getAcn()
								);//for bioportal compatibility
						SelectStmt term_id = new SelectStmt(soTermTable,"acn",term);
						
						vepPrediction2so.insert(
							outputWriter,
							null,
							pred_id,
							term_id
							);
						}
					*/
            }
        }
        /* insert genotypes */
        for (final String sampleName : sample2sampleid.keySet()) {
            final Genotype g = var.getGenotype(sampleName);
            if (!g.isAvailable() || g.isNoCall())
                continue;
            genotypeTable.insert(outputWriter, null, variant_id, sample2sampleid.get(sampleName), g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(0).getBaseString()) : null, g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(1).getBaseString()) : null, g.hasDP() ? g.getDP() : null, g.hasGQ() ? g.getGQ() : null);
        }
    }
    r.close();
}
Also used : VepPrediction(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) HashMap(java.util.HashMap) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) IOException(java.io.IOException) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Allele(htsjdk.variant.variantcontext.Allele) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Aggregations

VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator)55 VariantContext (htsjdk.variant.variantcontext.VariantContext)39 VCFHeader (htsjdk.variant.vcf.VCFHeader)35 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)30 ArrayList (java.util.ArrayList)28 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)26 IOException (java.io.IOException)24 File (java.io.File)22 HashSet (java.util.HashSet)19 List (java.util.List)19 Genotype (htsjdk.variant.variantcontext.Genotype)18 Parameter (com.beust.jcommander.Parameter)17 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)17 Program (com.github.lindenb.jvarkit.util.jcommander.Program)17 Logger (com.github.lindenb.jvarkit.util.log.Logger)17 Set (java.util.Set)17 Allele (htsjdk.variant.variantcontext.Allele)16 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)15 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)15 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)14