Search in sources :

Example 81 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class VcfBurden method doWork2.

private int doWork2(List<String> args) {
    ZipOutputStream zout = null;
    FileOutputStream fout = null;
    VcfIterator in = null;
    try {
        if (args.isEmpty()) {
            LOG.info("reading from stdin.");
            in = VCFUtils.createVcfIteratorStdin();
        } else if (args.size() == 1) {
            String filename = args.get(0);
            LOG.info("reading from " + filename);
            in = VCFUtils.createVcfIterator(filename);
        } else {
            LOG.error("Illegal number of arguments.");
            return -1;
        }
        if (outputFile == null) {
            LOG.error("undefined output");
            return -1;
        } else if (!outputFile.getName().endsWith(".zip")) {
            LOG.error("output " + outputFile + " should end with .zip");
            return -1;
        } else {
            fout = new FileOutputStream(outputFile);
            zout = new ZipOutputStream(fout);
        }
        final List<String> samples = in.getHeader().getSampleNamesInOrder();
        final VCFHeader header = in.getHeader();
        String prev_chrom = null;
        final VepPredictionParser vepPredParser = new VepPredictionParserFactory().header(header).get();
        final Map<GeneTranscript, List<VariantAndCsq>> gene2variants = new HashMap<>();
        final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
        final Set<SequenceOntologyTree.Term> acn = new HashSet<>();
        /* mail solena  *SO en remplacement des SO actuels (VEP HIGH + MODERATE) - pas la peine de faire retourner les analyses mais servira pour les futures analyses burden */
        String[] acn_list = new String[] { "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587", "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889", "SO:0001821", "SO:0001822", "SO:0001583", "SO:0001818" /*
					"SO:0001589", "SO:0001587", "SO:0001582", "SO:0001583",
					"SO:0001575", "SO:0001578", "SO:0001574", "SO:0001889",
					"SO:0001821", "SO:0001822", "SO:0001893"*/
        };
        if (this.highdamage) {
            acn_list = new String[] { "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587", "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889" };
        }
        for (final String acns : acn_list) {
            final SequenceOntologyTree.Term tacn = soTree.getTermByAcn(acns);
            if (tacn == null) {
                in.close();
                throw new NullPointerException("tacn == null pour " + acns);
            }
            acn.addAll(tacn.getAllDescendants());
        }
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader());
        for (; ; ) {
            VariantContext ctx1 = null;
            if (in.hasNext()) {
                ctx1 = progress.watch(in.next());
                if (ctx1.getAlternateAlleles().size() != 1) {
                    // info("count(ALT)!=1 in "+ctx1.getChr()+":"+ctx1.getStart());
                    continue;
                }
                if (ctx1.getAlternateAlleles().get(0).equals(Allele.SPAN_DEL)) {
                    continue;
                }
            }
            if (ctx1 == null || !ctx1.getContig().equals(prev_chrom)) {
                LOG.info("DUMP to zip n=" + gene2variants.size());
                final Set<String> geneNames = new HashSet<>();
                for (GeneTranscript gene_transcript : gene2variants.keySet()) {
                    geneNames.add(gene_transcript.geneName);
                    final Set<VariantAndCsq> uniq = new TreeSet<>(this.variantAndCsqComparator);
                    uniq.addAll(gene2variants.get(gene_transcript));
                    dumpVariants(zout, prev_chrom, gene_transcript.geneName + "_" + gene_transcript.transcriptName, samples, new ArrayList<VariantAndCsq>(uniq));
                    LOG.info("dumped" + gene_transcript.geneName);
                }
                LOG.info("loop over geneName");
                for (final String geneName : geneNames) {
                    final SortedSet<VariantAndCsq> lis_nm = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_all = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_refseq = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_enst = new TreeSet<>(this.variantAndCsqComparator);
                    LOG.info("loop over gene2variants");
                    for (final GeneTranscript gene_transcript : gene2variants.keySet()) {
                        if (!geneName.equals(gene_transcript.geneName))
                            continue;
                        lis_all.addAll(gene2variants.get(gene_transcript));
                        if (gene_transcript.transcriptName.startsWith("NM_")) {
                            lis_nm.addAll(gene2variants.get(gene_transcript));
                        }
                        if (!gene_transcript.transcriptName.startsWith("ENST")) {
                            lis_refseq.addAll(gene2variants.get(gene_transcript));
                        }
                        if (gene_transcript.transcriptName.startsWith("ENST")) {
                            lis_enst.addAll(gene2variants.get(gene_transcript));
                        }
                    }
                    LOG.info("dump_ALL_TRANSCRIPTS");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_TRANSCRIPTS", samples, new ArrayList<VariantAndCsq>(lis_all));
                    LOG.info("dump_ALL_NM");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_NM", samples, new ArrayList<VariantAndCsq>(lis_nm));
                    LOG.info("dump_ALL_REFSEQ");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_REFSEQ", samples, new ArrayList<VariantAndCsq>(lis_refseq));
                    LOG.info("dump_ALL_ENST");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_ENST", samples, new ArrayList<VariantAndCsq>(lis_enst));
                }
                if (ctx1 == null)
                    break;
                LOG.info("gene2variants");
                gene2variants.clear();
                LOG.info("System.gc();");
                System.gc();
                prev_chrom = ctx1.getContig();
                LOG.info("prev_chrom=" + prev_chrom);
            }
            final Set<GeneTranscript> seen_names = new HashSet<>();
            for (final VepPredictionParser.VepPrediction pred : vepPredParser.getPredictions(ctx1)) {
                String geneName = pred.getSymbol();
                if (geneName == null || geneName.trim().isEmpty())
                    continue;
                if (this._gene2seen != null) {
                    if (!this._gene2seen.containsKey(geneName))
                        continue;
                }
                final String transcriptName = pred.getFeature();
                if (transcriptName == null || transcriptName.trim().isEmpty()) {
                    LOG.info("No transcript in " + ctx1);
                    continue;
                }
                final GeneTranscript geneTranscript = new GeneTranscript(geneName, transcriptName);
                if (seen_names.contains(geneTranscript))
                    continue;
                boolean ok = false;
                for (SequenceOntologyTree.Term so : pred.getSOTerms()) {
                    if (acn.contains(so)) {
                        ok = true;
                    }
                }
                if (!printSOTerms && !ok)
                    continue;
                List<VariantAndCsq> L = gene2variants.get(geneTranscript);
                if (L == null) {
                    L = new ArrayList<>();
                    gene2variants.put(geneTranscript, L);
                }
                Float vqslod = null;
                if (this.printVQSLOD && ctx1.hasAttribute("VQSLOD")) {
                    vqslod = (float) ctx1.getAttributeAsDouble("VQSLOD", -9999999.0);
                }
                L.add(new VariantAndCsq(ctx1, pred.getSOTerms(), pred.getPositionInCDS(), vqslod));
                seen_names.add(geneTranscript);
                if (this._gene2seen != null) {
                    this._gene2seen.put(geneTranscript.geneName, Boolean.TRUE);
                }
            }
        }
        if (this._gene2seen != null) {
            final List<VariantAndCsq> emptylist = Collections.emptyList();
            for (final String gene : this._gene2seen.keySet()) {
                if (this._gene2seen.get(gene).equals(Boolean.TRUE))
                    continue;
                LOG.warning("Gene not found : " + gene);
                dumpVariants(zout, "UNDEFINED", gene + "_000000000000000.txt", samples, emptylist);
            }
        }
        progress.finish();
        zout.finish();
        fout.flush();
        zout.flush();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(in);
        CloserUtil.close(zout);
        CloserUtil.close(fout);
    }
}
Also used : HashMap(java.util.HashMap) VariantContext(htsjdk.variant.variantcontext.VariantContext) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) IOException(java.io.IOException) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) ZipOutputStream(java.util.zip.ZipOutputStream) FileOutputStream(java.io.FileOutputStream) SequenceOntologyTree(com.github.lindenb.jvarkit.util.so.SequenceOntologyTree) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 82 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class VcfCadd method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    try {
        VCFHeader header = in.getHeader();
        if (header.getSequenceDictionary() != null) {
            SAMSequenceDictionary dict = header.getSequenceDictionary();
            Set<String> vcfchr = new HashSet<String>();
            for (SAMSequenceRecord ssr : dict.getSequences()) vcfchr.add(ssr.getSequenceName());
            if (// nothing changed
            !vcfchr.retainAll(this.tabix.getChromosomes())) {
                LOG.warning("#### !!!! NO common chromosomes between tabix and vcf file. Check chromosome 'chr' prefix ? tabix chroms:" + this.tabix.getChromosomes());
            }
        }
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
        header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        header.addMetaDataLine(new VCFInfoHeaderLine(CADD_FLAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "(Allele|Score|Phred) Score suggests that that variant is likely to be  observed (negative values) vs simulated(positive values)." + "However, raw values do have relative meaning, with higher values indicating that a variant is more likely to be simulated (or -not observed-) and therefore more likely to have deleterious effects." + "PHRED expressing the rank in order of magnitude terms. For example, reference genome single nucleotide variants at the 10th-% of CADD scores are assigned to CADD-10, top 1% to CADD-20, top 0.1% to CADD-30, etc"));
        out.writeHeader(header);
        List<VariantContext> buffer = new ArrayList<>();
        for (; ; ) {
            VariantContext ctx = null;
            if (in.hasNext()) {
                ctx = progress.watch(in.next());
            }
            if (ctx == null || (!buffer.isEmpty() && (buffer.get(0).getContig().equals(ctx.getContig())) && (ctx.getEnd() - buffer.get(0).getStart()) > buffer_distance)) {
                if (!buffer.isEmpty()) {
                    runTabix(buffer);
                    for (VariantContext c : buffer) {
                        out.add(c);
                    }
                }
                if (ctx == null)
                    break;
                buffer.clear();
            }
            buffer.add(ctx);
        }
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Example 83 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class VcfGeneSplitter method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, File outputFile) {
    SortingCollection<KeyAndLine> sortingcollection = null;
    BufferedReader in = null;
    FileOutputStream fos = null;
    ZipOutputStream zout = null;
    CloseableIterator<KeyAndLine> iter = null;
    PrintWriter pw = null;
    try {
        in = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
        /**
         * find splitter by name
         */
        final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory().header(cah.header).get();
        sortingcollection = SortingCollection.newInstance(KeyAndLine.class, new KeyAndLineCodec(), new KeyAndLineComparator(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingcollection.setDestructiveIteration(true);
        // read variants
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(cah.header);
        String line;
        while ((line = in.readLine()) != null) {
            final VariantContext ctx = progess.watch(cah.codec.decode(line));
            // no check for ctx.ifFiltered here, we do this later.
            for (final String key : this.getVariantKeys(vepPredictionParser, ctx)) {
                sortingcollection.add(new KeyAndLine(key, line));
            }
        }
        progess.finish();
        sortingcollection.doneAdding();
        LOG.info("creating zip " + outputFile);
        fos = new FileOutputStream(outputFile);
        zout = new ZipOutputStream(fos);
        final File tmpReportFile = File.createTempFile("_tmp.", ".txt", writingSortingCollection.getTmpDirectories().get(0));
        tmpReportFile.deleteOnExit();
        pw = IOUtils.openFileForPrintWriter(tmpReportFile);
        pw.println("#chrom\tstart\tend\tkey\tCount_Variants");
        iter = sortingcollection.iterator();
        final EqualRangeIterator<KeyAndLine> eqiter = new EqualRangeIterator<>(iter, new Comparator<KeyAndLine>() {

            @Override
            public int compare(final KeyAndLine o1, final KeyAndLine o2) {
                return o1.key.compareTo(o2.key);
            }
        });
        while (eqiter.hasNext()) {
            final List<KeyAndLine> buffer = eqiter.next();
            final KeyAndLine first = buffer.get(0);
            LOG.info(first.key);
            final List<VariantContext> variants = new ArrayList<>(buffer.size());
            String contig = null;
            int chromStart = Integer.MAX_VALUE;
            int chromEnd = 0;
            for (final KeyAndLine kal : buffer) {
                final VariantContext ctx = cah.codec.decode(kal.ctx);
                variants.add(ctx);
                contig = ctx.getContig();
                chromStart = Math.min(chromStart, ctx.getStart());
                chromEnd = Math.max(chromEnd, ctx.getEnd());
            }
            pw.println(contig + "\t" + (chromStart - 1) + // -1 for bed compatibility
            "\t" + chromEnd + "\t" + first.key + "\t" + variants.size());
            // save vcf file
            final ZipEntry ze = new ZipEntry(this.baseZipDir + "/" + first.key + ".vcf");
            zout.putNextEntry(ze);
            final VariantContextWriter out = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(zout));
            final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
            header2.addMetaDataLine(new VCFHeaderLine("VcfGeneSplitter.Name", String.valueOf(first.key)));
            out.writeHeader(header2);
            for (final VariantContext ctx : variants) {
                out.add(ctx);
            }
            // yes because wrapped into IOUtils.encloseableOutputSream
            out.close();
            zout.closeEntry();
        }
        eqiter.close();
        iter.close();
        iter = null;
        progess.finish();
        LOG.info("saving report");
        pw.flush();
        pw.close();
        final ZipEntry entry = new ZipEntry(this.baseZipDir + "/manifest.bed");
        zout.putNextEntry(entry);
        IOUtils.copyTo(tmpReportFile, zout);
        zout.closeEntry();
        zout.finish();
        zout.close();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        if (sortingcollection != null)
            sortingcollection.cleanup();
        CloserUtil.close(in);
        CloserUtil.close(fos);
        CloserUtil.close(pw);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) ZipEntry(java.util.zip.ZipEntry) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) IOException(java.io.IOException) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) ZipOutputStream(java.util.zip.ZipOutputStream) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) File(java.io.File) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Example 84 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class VcfGroupByPopulation method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator vcfIn, VariantContextWriter out) {
    try {
        Reader r = IOUtils.openFileForBufferedReading(this.mappingFile);
        parsePopulationMapping(r);
        r.close();
        VCFHeader header = vcfIn.getHeader();
        Set<String> samplesInVcf = new HashSet<>(header.getSampleNamesInOrder());
        this.sample2population.keySet().retainAll(samplesInVcf);
        Map<String, Set<String>> population2samples = new HashMap<>();
        for (String sample : this.sample2population.keySet()) {
            String pop = this.sample2population.get(sample);
            Set<String> samples = population2samples.get(pop);
            if (samples == null) {
                samples = new HashSet<>();
                population2samples.put(pop, samples);
            }
            samples.add(sample);
        }
        for (String sample : header.getSampleNamesInOrder()) {
            if (!this.sample2population.containsKey(sample)) {
                throw new IOException("Sample " + sample + " not affected to a population");
            }
        }
        Set<VCFHeaderLine> metaData = new LinkedHashSet<>();
        for (VCFHeaderLine vhl : header.getMetaDataInInputOrder()) {
            if (!(vhl instanceof VCFContigHeaderLine))
                continue;
            metaData.add(vhl);
        }
        metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
        metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
        /* FORMAT */
        metaData.add(new VCFFormatHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Total Number of Samples"));
        metaData.add(new VCFFormatHeaderLine("R", 1, VCFHeaderLineType.Integer, "Number of alleles REF (hom:=2,het:=1)"));
        metaData.add(new VCFFormatHeaderLine("A", 1, VCFHeaderLineType.Integer, "Number of alleles ALT (hom:=2,het:=1)"));
        metaData.add(new VCFFormatHeaderLine("UNC", 1, VCFHeaderLineType.Integer, "Number of NON-called samples"));
        metaData.add(new VCFFormatHeaderLine("F", 1, VCFHeaderLineType.Float, "Allele Frequency A/(R+A)"));
        metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
        /* INFO */
        metaData.add(new VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Total Number of Samples"));
        metaData.add(new VCFInfoHeaderLine("R", 1, VCFHeaderLineType.Integer, "Number of alleles REF (hom:=2,het:=1)"));
        metaData.add(new VCFInfoHeaderLine("A", 1, VCFHeaderLineType.Integer, "Number of alleles ALT (hom:=2,het:=1)"));
        metaData.add(new VCFInfoHeaderLine("UNC", 1, VCFHeaderLineType.Integer, "Number of NON-called samples"));
        metaData.add(new VCFInfoHeaderLine("F", 1, VCFHeaderLineType.Float, "Allele Frequency A/(R+A)"));
        metaData.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
        metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
        VCFHeader h2 = new VCFHeader(metaData, population2samples.keySet());
        out.writeHeader(h2);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfIn.getHeader());
        while (vcfIn.hasNext()) {
            VariantContext ctx = progress.watch(vcfIn.next());
            VariantContextBuilder vcb = new VariantContextBuilder(ctx.getSource(), ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
            if (ctx.hasID())
                vcb.id(ctx.getID());
            GCount count_ctx = new GCount();
            List<Genotype> genotypes = new ArrayList<>(population2samples.size());
            for (String pop : population2samples.keySet()) {
                GCount count = new GCount();
                Set<String> samples = population2samples.get(pop);
                for (String sample : samples) {
                    Genotype g = ctx.getGenotype(sample);
                    count.watch(g);
                }
                GenotypeBuilder gb = new GenotypeBuilder(pop);
                gb.attribute("NS", samples.size());
                gb.attribute("R", count.R);
                gb.attribute("A", count.A);
                gb.attribute("UNC", count.uncalled);
                if (count.R + count.A > 0) {
                    gb.attribute("F", (float) count.A / (float) (count.R + count.A));
                }
                if (count.dp >= 0) {
                    gb.attribute("DP", count.dp);
                    if (count_ctx.dp == -1)
                        count_ctx.dp = 0;
                }
                genotypes.add(gb.make());
                count_ctx.R += count.R;
                count_ctx.A += count.A;
                count_ctx.uncalled += count.uncalled;
                count_ctx.dp += count.dp;
            }
            vcb.attribute("R", count_ctx.R);
            vcb.attribute("A", count_ctx.A);
            vcb.attribute("UNC", count_ctx.uncalled);
            if (count_ctx.R + count_ctx.A > 0) {
                vcb.attribute("F", (float) count_ctx.A / (float) (count_ctx.R + count_ctx.A));
            }
            if (count_ctx.dp >= 0) {
                vcb.attribute("DP", count_ctx.dp);
            }
            vcb.attribute("NS", this.sample2population.keySet().size());
            vcb.genotypes(genotypes);
            out.add(vcb.make());
        }
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Set(java.util.Set) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Reader(java.io.Reader) BufferedReader(java.io.BufferedReader) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFContigHeaderLine(htsjdk.variant.vcf.VCFContigHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) IOException(java.io.IOException) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Example 85 with SAMSequenceDictionaryProgress

use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.

the class LocalRealignReads method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidxFile == null) {
        LOG.error("REFerence file missing;");
        return -1;
    }
    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    SamReader samReader = null;
    SAMFileWriter w = null;
    SAMRecordIterator iter = null;
    GenomicSequence genomicSequence = null;
    LongestCommonSequence matrix = new LongestCommonSequence();
    try {
        indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
        samReader = openSamReader(oneFileOrNull(args));
        final SAMFileHeader header1 = samReader.getFileHeader();
        final SAMFileHeader header2 = header1.clone();
        header2.setSortOrder(SortOrder.unsorted);
        w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
        iter = samReader.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = progress.watch(iter.next());
            if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
                w.addAlignment(rec);
                continue;
            }
            final Cigar cigar = rec.getCigar();
            final int nCigarElement = cigar.numCigarElements();
            if (nCigarElement < 2) {
                w.addAlignment(rec);
                continue;
            }
            final CigarElement ce5 = cigar.getCigarElement(0);
            final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
            if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
                w.addAlignment(rec);
                continue;
            }
            if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
                genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
            }
            final CharSequence readseq = rec.getReadString();
            /**
             * short invert
             */
            if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
                }
            /*
					
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}
					*/
            }
            if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
                }
            /*
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}*/
            }
            /* other */
            List<LongestCommonSequence.Hit> hits = new ArrayList<>();
            align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
            if (hits.size() < 2000)
                continue;
            for (LongestCommonSequence.Hit hit : hits) {
                int readstart = 0;
                boolean in_M = false;
                for (int i = 0; i < nCigarElement; ++i) {
                    final CigarElement ce = cigar.getCigarElement(i);
                    if (ce.getOperator().consumesReadBases()) {
                        int readend = readstart + ce.getLength();
                        if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
                            if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
                                in_M = true;
                                break;
                            }
                        }
                        readstart = readend;
                    }
                }
                if (in_M)
                    continue;
                int align_size = hit.size();
                System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
                System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
                System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
                System.err.print("REF  :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(genomicSequence.charAt(hit.getStartX() + i));
                }
                System.err.println();
                System.err.print("READ :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(readseq.charAt(hit.getStartY() + i));
                }
                System.err.println();
            }
            System.err.println();
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception e) {
        return wrapException(e);
    } finally {
        genomicSequence = null;
        CloserUtil.close(iter);
        CloserUtil.close(samReader);
        CloserUtil.close(w);
        CloserUtil.close(indexedFastaSequenceFile);
        matrix = null;
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) ArrayList(java.util.ArrayList) LongestCommonSequence(com.github.lindenb.jvarkit.util.align.LongestCommonSequence) CigarElement(htsjdk.samtools.CigarElement) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) Cigar(htsjdk.samtools.Cigar) SubSequence(com.github.lindenb.jvarkit.lang.SubSequence) SAMRecord(htsjdk.samtools.SAMRecord) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Aggregations

SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)146 ArrayList (java.util.ArrayList)64 VariantContext (htsjdk.variant.variantcontext.VariantContext)59 VCFHeader (htsjdk.variant.vcf.VCFHeader)57 SAMRecord (htsjdk.samtools.SAMRecord)54 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)54 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)48 IOException (java.io.IOException)48 File (java.io.File)47 SamReader (htsjdk.samtools.SamReader)40 SAMFileHeader (htsjdk.samtools.SAMFileHeader)38 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)37 HashSet (java.util.HashSet)34 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)32 VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator)30 List (java.util.List)30 VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine)29 HashMap (java.util.HashMap)28 Parameter (com.beust.jcommander.Parameter)27 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)27