Search in sources :

Example 91 with VariantContextWriter

use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.

the class FastGenotypeGVCFs method doWork.

@Override
public int doWork(final List<String> args) {
    VariantContextWriter w = null;
    try {
        this.gvcfSources.addAll(IOUtil.unrollFiles(args.stream().map(S -> new File(S)).collect(Collectors.toSet()), ".g.vcf", ".g.vcf.gz").stream().map(F -> new Source(F)).collect(Collectors.toList()));
        if (args.isEmpty()) {
            LOG.error("No gvcf file was given");
            return -1;
        }
        this.gvcfSources.stream().forEach(S -> S.open());
        this.dictionary = this.gvcfSources.get(0).vcfFileReader.getFileHeader().getSequenceDictionary();
        if (this.dictionary == null) {
            LOG.error("Dict missing in " + this.gvcfSources.get(0).gvcfFile);
            return -1;
        }
        this.gvcfSources.stream().map(S -> S.vcfFileReader.getFileHeader().getSequenceDictionary()).forEach(D -> {
            if (D == null || !SequenceUtil.areSequenceDictionariesEqual(D, dictionary)) {
                throw new JvarkitException.UserError("dict missing or dict are not the same");
            }
        });
        if (gvcfSources.stream().map(S -> S.sampleName).collect(Collectors.toSet()).stream().count() != this.gvcfSources.size()) {
            LOG.error("Duplicate sample name. check input");
            return -1;
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.GENOTYPE_PL_KEY);
        metaData.addAll(gvcfSources.stream().flatMap(S -> S.vcfFileReader.getFileHeader().getFormatHeaderLines().stream()).collect(Collectors.toSet()));
        final VCFHeader header = new VCFHeader(metaData, gvcfSources.stream().map(S -> S.sampleName).collect(Collectors.toList()));
        w = super.openVariantContextWriter(outputFile);
        w.writeHeader(header);
        int contigTid = 0;
        while (contigTid < dictionary.size()) {
            final SAMSequenceRecord ssr = this.dictionary.getSequence(contigTid);
            int pos = 0;
            while (pos < ssr.getSequenceLength()) {
                // LOG.debug(ssr.getSequenceName()+" "+pos+" "+this.gvcfSources.size());
                GvcfVariant variantAtThisPos = null;
                int minEnd = ssr.getSequenceLength();
                // cleanup
                for (final Source src : this.gvcfSources) {
                    for (; ; ) {
                        final GvcfThing gvcfthing = src.get();
                        // LOG.debug(""+gvcfthing+" "+src.sampleName+" "+ssr.getSequenceName()+":"+pos);
                        if (gvcfthing == null) {
                            // no more variant avaialble
                            break;
                        } else if (contigTid > gvcfthing.getTid()) {
                            // observed contig is after gvcfthing.contig
                            src.current = null;
                            continue;
                        } else if (contigTid < gvcfthing.getTid()) {
                            // observed contig is before gvcfthing.contig
                            break;
                        } else if (gvcfthing.getEnd() < pos) {
                            // variant information is before observed pos
                            src.current = null;
                            continue;
                        } else if (gvcfthing.getStart() > pos) {
                            // variant information is after observed pos
                            minEnd = Math.min(minEnd, gvcfthing.getStart() - 1);
                            break;
                        } else if (gvcfthing.isVariant()) {
                            if (variantAtThisPos == null || variantContigPosRefComparator.compare(GvcfVariant.class.cast(gvcfthing).ctx, variantAtThisPos.ctx) < 0) {
                                variantAtThisPos = GvcfVariant.class.cast(gvcfthing);
                            }
                            break;
                        } else if (gvcfthing.isBlock()) {
                            minEnd = Math.min(minEnd, gvcfthing.getEnd());
                            break;
                        } else {
                            LOG.debug("??");
                        }
                    }
                }
                if (variantAtThisPos == null) {
                    pos = minEnd + 1;
                } else {
                    final VariantContext archetype = variantAtThisPos.ctx;
                    final List<VariantContext> allVariants = this.gvcfSources.stream().map(S -> S.get()).filter(G -> G != null && G.isVariant()).map(G -> GvcfVariant.class.cast(G).ctx).filter(V -> variantContigPosRefComparator.compare(V, archetype) == 0).collect(Collectors.toList());
                    ;
                    final Set<Allele> alleles = allVariants.stream().flatMap(V -> V.getGenotypes().stream()).flatMap(G -> G.getAlleles().stream()).filter(A -> !(A.equals(NON_REF) || A.isNoCall())).collect(Collectors.toSet());
                    alleles.add(archetype.getReference());
                    final VariantContextBuilder vcb = new VariantContextBuilder(getClass().getName(), archetype.getContig(), archetype.getStart(), archetype.getEnd(), alleles);
                    if (archetype.hasID()) {
                        vcb.id(archetype.getID());
                    }
                    final List<Genotype> genotypes = new ArrayList<>(allVariants.size());
                    for (VariantContext ctx : allVariants) {
                        Genotype genotype = ctx.getGenotype(0);
                        GenotypeBuilder gb = new GenotypeBuilder(genotype);
                        genotypes.add(gb.make());
                    }
                    vcb.genotypes(genotypes);
                    final VariantContext genotypedVariant;
                    try {
                        genotypedVariant = vcb.make();
                    } catch (Exception err2) {
                        LOG.debug(allVariants);
                        LOG.error(err2);
                        return -1;
                    }
                    w.add(genotypedVariant);
                    // reset source for the current variant
                    for (final Source src : this.gvcfSources) {
                        if (src.current != null && variantContigPosRefComparator.compare(src.current.ctx, archetype) == 0) {
                            src.current = null;
                        }
                    }
                }
            }
            ++contigTid;
        }
        w.close();
        w = null;
        this.gvcfSources.stream().forEach(S -> S.close());
        this.gvcfSources.clear();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        for (final Source src : this.gvcfSources) src.close();
        CloserUtil.close(w);
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) IOUtil(htsjdk.samtools.util.IOUtil) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VCFHeader(htsjdk.variant.vcf.VCFHeader) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) PeekableIterator(htsjdk.samtools.util.PeekableIterator) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) File(java.io.File) HashSet(java.util.HashSet)

Example 92 with VariantContextWriter

use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.

the class VcfMultiToOne method doWork.

@Override
public int doWork(final List<String> arguments) {
    VariantContextWriter out = null;
    Set<String> args = IOUtils.unrollFiles(arguments);
    List<VcfIterator> inputs = new ArrayList<>(args.size() + 1);
    List<String> inputFiles = new ArrayList<>(args.size() + 1);
    try {
        if (args.isEmpty() && arguments.isEmpty()) {
            inputs.add(VCFUtils.createVcfIteratorStdin());
            inputFiles.add("stdin");
        } else if (args.isEmpty()) {
            LOG.error("No vcf provided");
            return -1;
        } else {
            for (final String vcfFile : args) {
                inputs.add(VCFUtils.createVcfIterator(vcfFile));
                inputFiles.add(VCFUtils.escapeInfoField(vcfFile));
            }
        }
        SAMSequenceDictionary dict = null;
        final Set<String> sampleNames = new HashSet<String>();
        final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        for (final VcfIterator in : inputs) {
            final VCFHeader header = in.getHeader();
            if (dict == null) {
                dict = header.getSequenceDictionary();
            } else if (header.getSequenceDictionary() == null) {
                LOG.error("No Dictionary in vcf");
                return -1;
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, header.getSequenceDictionary())) {
                LOG.error("Not the same dictionary between vcfs");
                return -1;
            }
            metaData.addAll(in.getHeader().getMetaDataInInputOrder());
            sampleNames.addAll(in.getHeader().getSampleNamesInOrder());
        }
        final Comparator<VariantContext> comparator = (dict == null ? VCFUtils.createChromPosRefComparator() : VCFUtils.createTidPosRefComparator(dict));
        // addMetaData(metaData);
        metaData.add(new VCFInfoHeaderLine(DEFAULT_SAMPLE_TAGID, 1, VCFHeaderLineType.String, "Sample Name from multi-sample vcf"));
        metaData.add(new VCFInfoHeaderLine(DEFAULT_SAMPLE_FILETAGID, 1, VCFHeaderLineType.String, "Origin of sample"));
        for (final String sample : sampleNames) {
            metaData.add(new VCFHeaderLine(SAMPLE_HEADER_DECLARATION, sample));
        }
        final VCFHeader h2 = new VCFHeader(metaData, Collections.singleton(DEFAULT_VCF_SAMPLE_NAME));
        recalculator.setHeader(h2);
        out = super.openVariantContextWriter(this.outputFile);
        out.writeHeader(h2);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        for (; ; ) {
            if (out.checkError())
                break;
            /* get 'smallest' variant */
            VariantContext smallest = null;
            int idx = 0;
            int best_idx = -1;
            while (idx < inputs.size()) {
                final VcfIterator in = inputs.get(idx);
                if (!in.hasNext()) {
                    CloserUtil.close(in);
                    inputs.remove(idx);
                    inputFiles.remove(idx);
                } else {
                    final VariantContext ctx = in.peek();
                    if (smallest == null || comparator.compare(smallest, ctx) > 0) {
                        smallest = ctx;
                        best_idx = idx;
                    }
                    ++idx;
                }
            }
            if (smallest == null)
                break;
            final VariantContext ctx = progress.watch(inputs.get(best_idx).next());
            if (ctx.getNSamples() == 0) {
                if (!this.discard_no_call) {
                    final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                    vcb.attribute(DEFAULT_SAMPLE_FILETAGID, inputFiles.get(best_idx));
                    vcb.genotypes(GenotypeBuilder.createMissing(DEFAULT_VCF_SAMPLE_NAME, 2));
                    out.add(this.recalculator.apply(vcb.make()));
                }
                continue;
            }
            for (int i = 0; i < ctx.getNSamples(); ++i) {
                final Genotype g = ctx.getGenotype(i);
                final String sample = g.getSampleName();
                if (!g.isCalled() && this.discard_no_call)
                    continue;
                if (!g.isAvailable() && this.discard_non_available)
                    continue;
                if (g.isHomRef() && this.discard_hom_ref)
                    continue;
                final GenotypeBuilder gb = new GenotypeBuilder(g);
                gb.name(DEFAULT_VCF_SAMPLE_NAME);
                final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                vcb.attribute(DEFAULT_SAMPLE_TAGID, sample);
                vcb.attribute(DEFAULT_SAMPLE_FILETAGID, inputFiles.get(best_idx));
                vcb.genotypes(gb.make());
                out.add(this.recalculator.apply(vcb.make()));
            }
        }
        progress.finish();
        LOG.debug("done");
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(inputs);
        CloserUtil.close(out);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Example 93 with VariantContextWriter

use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.

the class VcfGeneSplitter method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, File outputFile) {
    SortingCollection<KeyAndLine> sortingcollection = null;
    BufferedReader in = null;
    FileOutputStream fos = null;
    ZipOutputStream zout = null;
    CloseableIterator<KeyAndLine> iter = null;
    PrintWriter pw = null;
    try {
        in = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
        /**
         * find splitter by name
         */
        final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory().header(cah.header).get();
        sortingcollection = SortingCollection.newInstance(KeyAndLine.class, new KeyAndLineCodec(), new KeyAndLineComparator(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingcollection.setDestructiveIteration(true);
        // read variants
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(cah.header);
        String line;
        while ((line = in.readLine()) != null) {
            final VariantContext ctx = progess.watch(cah.codec.decode(line));
            // no check for ctx.ifFiltered here, we do this later.
            for (final String key : this.getVariantKeys(vepPredictionParser, ctx)) {
                sortingcollection.add(new KeyAndLine(key, line));
            }
        }
        progess.finish();
        sortingcollection.doneAdding();
        LOG.info("creating zip " + outputFile);
        fos = new FileOutputStream(outputFile);
        zout = new ZipOutputStream(fos);
        final File tmpReportFile = File.createTempFile("_tmp.", ".txt", writingSortingCollection.getTmpDirectories().get(0));
        tmpReportFile.deleteOnExit();
        pw = IOUtils.openFileForPrintWriter(tmpReportFile);
        pw.println("#chrom\tstart\tend\tkey\tCount_Variants");
        iter = sortingcollection.iterator();
        final EqualRangeIterator<KeyAndLine> eqiter = new EqualRangeIterator<>(iter, new Comparator<KeyAndLine>() {

            @Override
            public int compare(final KeyAndLine o1, final KeyAndLine o2) {
                return o1.key.compareTo(o2.key);
            }
        });
        while (eqiter.hasNext()) {
            final List<KeyAndLine> buffer = eqiter.next();
            final KeyAndLine first = buffer.get(0);
            LOG.info(first.key);
            final List<VariantContext> variants = new ArrayList<>(buffer.size());
            String contig = null;
            int chromStart = Integer.MAX_VALUE;
            int chromEnd = 0;
            for (final KeyAndLine kal : buffer) {
                final VariantContext ctx = cah.codec.decode(kal.ctx);
                variants.add(ctx);
                contig = ctx.getContig();
                chromStart = Math.min(chromStart, ctx.getStart());
                chromEnd = Math.max(chromEnd, ctx.getEnd());
            }
            pw.println(contig + "\t" + (chromStart - 1) + // -1 for bed compatibility
            "\t" + chromEnd + "\t" + first.key + "\t" + variants.size());
            // save vcf file
            final ZipEntry ze = new ZipEntry(this.baseZipDir + "/" + first.key + ".vcf");
            zout.putNextEntry(ze);
            final VariantContextWriter out = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(zout));
            final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
            header2.addMetaDataLine(new VCFHeaderLine("VcfGeneSplitter.Name", String.valueOf(first.key)));
            out.writeHeader(header2);
            for (final VariantContext ctx : variants) {
                out.add(ctx);
            }
            // yes because wrapped into IOUtils.encloseableOutputSream
            out.close();
            zout.closeEntry();
        }
        eqiter.close();
        iter.close();
        iter = null;
        progess.finish();
        LOG.info("saving report");
        pw.flush();
        pw.close();
        final ZipEntry entry = new ZipEntry(this.baseZipDir + "/manifest.bed");
        zout.putNextEntry(entry);
        IOUtils.copyTo(tmpReportFile, zout);
        zout.closeEntry();
        zout.finish();
        zout.close();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        if (sortingcollection != null)
            sortingcollection.cleanup();
        CloserUtil.close(in);
        CloserUtil.close(fos);
        CloserUtil.close(pw);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) ZipEntry(java.util.zip.ZipEntry) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) IOException(java.io.IOException) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) ZipOutputStream(java.util.zip.ZipOutputStream) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) File(java.io.File) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 94 with VariantContextWriter

use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.

the class VcfSpringFilter method doWork.

@Override
public int doWork(final List<String> args) {
    VcfIterator vcfIn = null;
    VariantContextWriter vcfOut = null;
    try {
        if (this.springCongigFiles.isEmpty()) {
            LOG.error("no spring config file was provided");
            return -1;
        }
        this.springApplicationContext = new FileSystemXmlApplicationContext(springCongigFiles.toArray(new String[springCongigFiles.size()]));
        if (!this.springApplicationContext.containsBean(this.mainBeanName)) {
            LOG.error("cannot get bean " + mainBeanName + " in " + this.springCongigFiles);
            return -1;
        }
        final Object o = this.springApplicationContext.getBean(mainBeanName);
        if (o == null || !(o instanceof VariantContextWriter)) {
            LOG.error("bean " + mainBeanName + " is not a  VcfChain but " + (o == null ? "null" : o.getClass().getName()));
            return -1;
        }
        final VariantContextWriter writer = VariantContextWriter.class.cast(o);
        final String inputFile = oneFileOrNull(args);
        vcfIn = super.openVcfIterator(inputFile);
        return ret;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(vcfIn);
        CloserUtil.close(vcfOut);
        this.springApplicationContext = null;
    }
}
Also used : FileSystemXmlApplicationContext(org.springframework.context.support.FileSystemXmlApplicationContext) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter)

Example 95 with VariantContextWriter

use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.

the class MsaToVcf method doWork.

@Override
public int doWork(final List<String> args) {
    VariantContextWriter w = null;
    LineIterator r = null;
    try {
        final String inputName = oneFileOrNull(args);
        if (inputName == null) {
            LOG.info("Reading from stdin");
            r = IOUtils.openStreamForLineIterator(stdin());
        } else {
            LOG.info("Reading from " + inputName);
            r = IOUtils.openURIForLineIterator(inputName);
        }
        Format format = Format.None;
        /**
         * try to guess format
         */
        while (r.hasNext() && format == Format.None) {
            final String line = r.peek();
            if (line.trim().isEmpty()) {
                r.next();
                continue;
            }
            if (line.startsWith("CLUSTAL")) {
                format = Format.Clustal;
                // consume
                r.next();
                break;
            } else if (line.startsWith(">")) {
                format = Format.Fasta;
                break;
            } else {
                LOG.error("MSA format not recognized in " + line);
                return -1;
            }
        }
        LOG.info("format : " + format);
        /**
         * parse lines as FASTA
         */
        if (Format.Fasta.equals(format)) {
            this.consensus = new FastaConsensus();
            AlignSequence curr = null;
            while (r.hasNext()) {
                String line = r.next();
                if (line.startsWith(">")) {
                    curr = new AlignSequence();
                    curr.name = line.substring(1).trim();
                    if (sample2sequence.containsKey(curr.name)) {
                        LOG.error("Sequence ID " + curr.name + " defined twice");
                        return -1;
                    }
                    sample2sequence.put(curr.name, curr);
                } else if (curr != null) {
                    curr.seq.append(line.trim());
                    this.align_length = Math.max(this.align_length, curr.seq.length());
                }
            }
        /*
				//remove heading & trailing '-'
				for(final String sample:this.sample2sequence.keySet())
					{
					final AlignSequence seq = this.sample2sequence.get(sample);
					int i=0;
					while(i<this.align_length && seq.at(i)==DELETION)
						{
						seq.seq.setCharAt(i, CLIPPING);
						++i;
						}
					i= this.align_length-1;
					while(i>=0 && seq.at(i)==DELETION)
						{
						seq.seq.setCharAt(i, CLIPPING);
						--i;
						}
					}*/
        } else /**
         * parse lines as CLUSTAL
         */
        if (Format.Clustal.equals(format)) {
            ClustalConsensus clustalconsensus = new ClustalConsensus();
            this.consensus = clustalconsensus;
            AlignSequence curr = null;
            int columnStart = -1;
            while (r.hasNext()) {
                String line = r.next();
                if (line.trim().isEmpty() || line.startsWith("CLUSTAL W")) {
                    columnStart = -1;
                    continue;
                }
                if (line.charAt(0) == ' ') {
                    if (columnStart == -1) {
                        LOG.error("illegal consensus line for " + line);
                        return -1;
                    }
                    /* if consensus doesn't exist in the first rows */
                    while (clustalconsensus.seq.length() < (this.align_length - (line.length() - columnStart))) {
                        clustalconsensus.seq.append(" ");
                    }
                    clustalconsensus.seq.append(line.substring(columnStart));
                } else {
                    if (columnStart == -1) {
                        columnStart = line.indexOf(' ');
                        if (columnStart == -1) {
                            LOG.error("no whithespace in " + line);
                            return -1;
                        }
                        while (columnStart < line.length() && line.charAt(columnStart) == ' ') {
                            columnStart++;
                        }
                    }
                    String seqname = line.substring(0, columnStart).trim();
                    curr = this.sample2sequence.get(seqname);
                    if (curr == null) {
                        curr = new AlignSequence();
                        curr.name = seqname;
                        this.sample2sequence.put(curr.name, curr);
                    }
                    int columnEnd = line.length();
                    // remove blanks and digit at the end
                    while (columnEnd - 1 > columnStart && (line.charAt(columnEnd - 1) == ' ' || Character.isDigit(line.charAt(columnEnd - 1)))) {
                        columnEnd--;
                    }
                    curr.seq.append(line.substring(columnStart, columnEnd));
                    this.align_length = Math.max(align_length, curr.seq.length());
                }
            }
        } else {
            LOG.error("Undefined input format");
            return -1;
        }
        CloserUtil.close(r);
        /* sequence consensus was set*/
        if (consensusRefName != null) {
            AlignSequence namedSequence = null;
            if ((namedSequence = sample2sequence.get(consensusRefName)) == null) {
                LOG.error("Cannot find consensus sequence \"" + consensusRefName + "\" in list of sequences: " + this.sample2sequence.keySet().toString());
                return -1;
            }
            this.consensus = new NamedConsensus(namedSequence);
        }
        /**
         * we're done, print VCF
         */
        /**
         * first, print header
         */
        Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
        vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
        // super.addMetaData(vcfHeaderLines);
        Map<String, String> mapping = new HashMap<String, String>();
        mapping.put("ID", REF);
        mapping.put("length", String.valueOf(this.align_length));
        vcfHeaderLines.add(new VCFContigHeaderLine(mapping, 1));
        Set<String> samples = new TreeSet<String>(this.sample2sequence.keySet());
        VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, samples);
        w = super.openVariantContextWriter(this.outputFile);
        w.writeHeader(vcfHeader);
        /**
         * loop over data, print header
         */
        int pos1 = 0;
        while (pos1 < align_length) {
            // is it a real variation or print all sites
            boolean is_variation;
            if (consensus.at(pos1) == MATCH) {
                if (this.printAllSites) {
                    is_variation = false;
                } else {
                    ++pos1;
                    continue;
                }
            } else {
                is_variation = true;
            }
            int pos2 = pos1 + 1;
            // don't extend if no variation and printAllSites
            while (is_variation && pos2 < align_length && consensus.at(pos2) != MATCH) {
                ++pos2;
            }
            boolean is_subsitution = (pos1 + 1 == pos2);
            if (// need pos1>0 because ALT contains prev base.
            is_subsitution && pos1 != 0 && is_variation) {
                for (Sequence seq : this.sample2sequence.values()) {
                    if (seq.at(pos1) == DELETION) {
                        is_subsitution = false;
                        break;
                    }
                }
            }
            Set<Allele> alleles = new HashSet<Allele>();
            VariantContextBuilder vcb = new VariantContextBuilder();
            List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
            /* longest variant */
            String longest = null;
            Counter<String> countAlleles = new Counter<String>();
            Map<String, String> sample2genotype = new HashMap<String, String>(samples.size());
            String namedConsensusRefAllele = "N";
            /* loop over the sequences of each seample */
            for (String sample : samples) {
                Sequence seq = this.sample2sequence.get(sample);
                String al = null;
                if (is_subsitution) {
                    if (seq.at(pos1) == CLIPPING)
                        continue;
                    al = String.valueOf(seq.at(pos1));
                } else {
                    StringBuilder sb = new StringBuilder(pos2 - pos1);
                    for (// yes -1
                    int i = Math.max(0, pos1 - 1); i < pos2; ++i) {
                        if (seq.at(i) == CLIPPING)
                            continue;
                        if (seq.at(i) == DELETION)
                            continue;
                        sb.append(seq.at(i));
                    }
                    if (sb.length() == 0)
                        continue;
                    al = sb.toString();
                }
                /* did we find the longest allele ?*/
                if (longest == null || longest.length() < al.length()) {
                    // reset count of most frequent, we'll use the longest indel or subst
                    countAlleles = new Counter<String>();
                    longest = al;
                }
                countAlleles.incr(al);
                sample2genotype.put(sample, al);
                /* if consensus sequence name was defined , record this allele for future use */
                if (consensusRefName != null && sample.equals(consensusRefName)) {
                    namedConsensusRefAllele = al;
                }
            }
            if (countAlleles.isEmpty()) {
                if (// printAllSites=false
                printAllSites == false) {
                    continue;
                }
                /* no a real variation, just add a dummy 'N' */
                countAlleles.incr("N");
            }
            String refAllStr;
            if (consensusRefName == null) {
                refAllStr = countAlleles.getMostFrequent();
            } else {
                refAllStr = namedConsensusRefAllele;
            }
            final Allele refAllele = Allele.create(refAllStr.replaceAll("[^ATGCatgc]", "N"), true);
            alleles.add(refAllele);
            /* loop over samples, , build each genotype */
            for (String sample : sample2genotype.keySet()) {
                Allele al = null;
                if (!sample2genotype.containsKey(sample)) {
                // nothing
                } else if (sample2genotype.get(sample).equals(refAllStr)) {
                    al = refAllele;
                } else {
                    al = Allele.create(sample2genotype.get(sample).replaceAll("[^ATGCatgc]", "N"), false);
                    alleles.add(al);
                }
                if (al != null) {
                    final GenotypeBuilder gb = new GenotypeBuilder(sample);
                    final List<Allele> sampleAlleles = new ArrayList<Allele>(2);
                    sampleAlleles.add(al);
                    if (!haploid)
                        sampleAlleles.add(al);
                    gb.alleles(sampleAlleles);
                    gb.DP(1);
                    genotypes.add(gb.make());
                } else {
                    genotypes.add(GenotypeBuilder.createMissing(sample, haploid ? 1 : 2));
                }
            }
            // got to 1-based ref if subst, for indel with use pos(base)-1
            final int start = pos1 + (is_subsitution ? 1 : 0);
            vcb.start(start);
            vcb.stop(start + (refAllStr.length() - 1));
            vcb.chr(REF);
            HashMap<String, Object> atts = new HashMap<String, Object>();
            atts.put(VCFConstants.DEPTH_KEY, genotypes.size());
            vcb.attributes(atts);
            vcb.alleles(alleles);
            vcb.genotypes(genotypes);
            w.add(vcb.make());
            pos1 = pos2;
        }
        w.close();
        if (outFasta != null) {
            final PrintWriter fasta = super.openFileOrStdoutAsPrintWriter(outFasta);
            for (final String sample : samples) {
                fasta.println(">" + sample);
                final Sequence seq = this.sample2sequence.get(sample);
                for (int i = 0; i < align_length; ++i) {
                    fasta.print(seq.at(i));
                }
                fasta.println();
            }
            fasta.println(">CONSENSUS");
            for (int i = 0; i < align_length; ++i) {
                fasta.print(consensus.at(i));
            }
            fasta.println();
            fasta.flush();
            fasta.close();
        }
        LOG.info("Done");
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
        CloserUtil.close(w);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) LineIterator(htsjdk.tribble.readers.LineIterator) VCFContigHeaderLine(htsjdk.variant.vcf.VCFContigHeaderLine) Counter(com.github.lindenb.jvarkit.util.Counter) TreeSet(java.util.TreeSet) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) PrintWriter(java.io.PrintWriter) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Aggregations

VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)132 File (java.io.File)67 VCFHeader (htsjdk.variant.vcf.VCFHeader)65 VariantContext (htsjdk.variant.variantcontext.VariantContext)60 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)53 ArrayList (java.util.ArrayList)41 IOException (java.io.IOException)38 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)35 VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine)34 HashSet (java.util.HashSet)32 List (java.util.List)29 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)28 Allele (htsjdk.variant.variantcontext.Allele)28 Collectors (java.util.stream.Collectors)27 VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator)26 VCFFileReader (htsjdk.variant.vcf.VCFFileReader)26 Parameter (com.beust.jcommander.Parameter)24 Logger (com.github.lindenb.jvarkit.util.log.Logger)24 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)23 Program (com.github.lindenb.jvarkit.util.jcommander.Program)23