Search in sources :

Example 76 with Genotype

use of htsjdk.variant.variantcontext.Genotype in project jvarkit by lindenb.

the class VcfCalledWithAnotherMethod method doVcfToVcf.

protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
    final List<ExternalVcf> externalVcfs = new ArrayList<>();
    try {
        final VCFHeader header = in.getHeader();
        this.dictionary = header.getSequenceDictionary();
        /**
         * open primitive input
         */
        if (this.dictionary == null) {
            LOG.error("no dictionary in input");
            return -1;
        }
        final Set<File> samtoolsFiles = new HashSet<>();
        this.externalVcfStrs.stream().filter(S -> S.endsWith(".list")).map(S -> Paths.get(S)).forEach(P -> {
            try {
                samtoolsFiles.addAll(Files.readAllLines(P).stream().filter(L -> !(L.startsWith("#") || L.trim().isEmpty())).map(S -> new File(S)).collect(Collectors.toSet()));
            } catch (final Exception err) {
                throw new RuntimeIOException(err);
            }
        });
        samtoolsFiles.addAll(this.externalVcfStrs.stream().filter(S -> !S.endsWith(".list")).map(S -> new File(S)).collect(Collectors.toSet()));
        externalVcfs.addAll(samtoolsFiles.stream().map(F -> new ExternalVcf(F)).collect(Collectors.toList()));
        /**
         * check for uniq keys
         */
        final Set<String> uniqKeys = new HashSet<>();
        for (final ExternalVcf extvcf : externalVcfs) {
            int n = 0;
            for (; ; ) {
                final String newkey = extvcf.key + (n == 0 ? "" : "_" + n);
                if (!uniqKeys.contains(newkey)) {
                    extvcf.key = newkey;
                    uniqKeys.add(newkey);
                    break;
                }
                ++n;
            }
        }
        final VCFHeader h2 = new VCFHeader(header);
        for (final ExternalVcf extvcf : externalVcfs) {
            h2.addMetaDataLine(new VCFHeaderLine("otherVcf:" + extvcf.key, extvcf.vcfFile.getPath()));
        }
        final VCFFilterHeaderLine variantNotFoundElsewhereFILTER = (filterNameVariantNotFoundElseWhere.isEmpty() ? null : new VCFFilterHeaderLine(filterNameVariantNotFoundElseWhere, "Variant Was not found in other VCFs: " + externalVcfs.stream().map(S -> S.vcfFile.getPath()).collect(Collectors.joining(", "))));
        if (variantNotFoundElsewhereFILTER != null) {
            h2.addMetaDataLine(variantNotFoundElsewhereFILTER);
        }
        final VCFInfoHeaderLine variantFoundElseWhereKeys = new VCFInfoHeaderLine(this.infoFoundKey, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Variant was found in the VCFs designed by those keys");
        h2.addMetaDataLine(variantFoundElseWhereKeys);
        final VCFInfoHeaderLine variantFoundElseWhereCount = new VCFInfoHeaderLine(this.infoFoundKey, 1, VCFHeaderLineType.Integer, "Number of times the Variant was found  in the VCFs");
        h2.addMetaDataLine(variantFoundElseWhereCount);
        final VCFFormatHeaderLine genotypeCountSame = new VCFFormatHeaderLine(this.formatCountSame, 1, VCFHeaderLineType.Integer, "Number of times the Genotype was found the same in other VCFs");
        h2.addMetaDataLine(genotypeCountSame);
        final VCFFormatHeaderLine genotypeCountDiscordant = new VCFFormatHeaderLine(this.formatCountDiscordant, 1, VCFHeaderLineType.Integer, "Number of times the Genotype was found dicordant in other VCFs");
        h2.addMetaDataLine(genotypeCountDiscordant);
        super.addMetaData(h2);
        final VariantContextWriter w = super.openVariantContextWriter(outputFile);
        w.writeHeader(h2);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.dictionary);
        while (in.hasNext()) {
            final VariantContext ctx = progress.watch(in.next());
            final List<GenotypeCount> genotypeCounts = ctx.getGenotypes().stream().map(G -> new GenotypeCount(G)).collect(Collectors.toList());
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            final Set<String> variantFoundInOtherVcfs = new HashSet<>();
            for (final ExternalVcf extvcf : externalVcfs) {
                final List<VariantContext> otherVariants = extvcf.get(ctx);
                if (otherVariants.stream().filter(CTX2 -> {
                    if (ctx.getAlternateAlleles().isEmpty())
                        return true;
                    for (final Allele a : ctx.getAlternateAlleles()) {
                        if (CTX2.hasAllele(a))
                            return true;
                    }
                    return false;
                }).findAny().isPresent()) {
                    variantFoundInOtherVcfs.add(extvcf.key);
                }
                for (final GenotypeCount gt : genotypeCounts) {
                    for (final VariantContext otherVariant : otherVariants) {
                        final Genotype otherGt = otherVariant.getGenotype(gt.gt.getSampleName());
                        if (otherGt == null)
                            continue;
                        if (gt.gt.sameGenotype(otherGt) || (this.noCallSameAsHomRef && ((gt.gt.isNoCall() && otherGt.isHomRef()) || (gt.gt.isHomRef() && otherGt.isNoCall())))) {
                            gt.countSame++;
                        } else {
                            gt.countDiscordant++;
                        }
                    }
                }
            }
            vcb.genotypes(genotypeCounts.stream().map(G -> {
                final GenotypeBuilder gb = new GenotypeBuilder(G.gt);
                gb.attribute(genotypeCountSame.getID(), G.countSame);
                gb.attribute(genotypeCountDiscordant.getID(), G.countDiscordant);
                return gb.make();
            }).collect(Collectors.toList()));
            vcb.attribute(variantFoundElseWhereCount.getID(), variantFoundInOtherVcfs.size());
            if (variantFoundInOtherVcfs.isEmpty()) {
                if (variantNotFoundElsewhereFILTER != null) {
                    vcb.filter(variantNotFoundElsewhereFILTER.getID());
                }
            } else {
                if (variantNotFoundElsewhereFILTER != null && !ctx.isFiltered()) {
                    vcb.passFilters();
                }
                vcb.attribute(variantFoundElseWhereKeys.getID(), new ArrayList<>(variantFoundInOtherVcfs));
            }
            w.add(vcb.make());
        }
        progress.finish();
        while (!externalVcfs.isEmpty()) externalVcfs.remove(0).close();
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        while (!externalVcfs.isEmpty()) externalVcfs.remove(0).close();
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Files(java.nio.file.Files) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) List(java.util.List) Paths(java.nio.file.Paths) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) PeekIterator(htsjdk.samtools.util.PeekIterator) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) Comparator(java.util.Comparator) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) File(java.io.File)

Example 77 with Genotype

use of htsjdk.variant.variantcontext.Genotype in project jvarkit by lindenb.

the class VcfCutSamples method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    VCFHeader header = in.getHeader();
    final Set<String> samples1 = new HashSet<String>(header.getSampleNamesInOrder());
    for (String my : this.getUserSamples()) {
        if (!samples1.contains(my)) {
            String msg = "user sample " + my + " is not present in VCF Header : " + samples1;
            if (this.missing_sample_is_error) {
                throw new RuntimeException(msg);
            } else {
                LOG.warning(msg);
            }
        }
    }
    final List<String> samples2 = new ArrayList<String>();
    for (final String sample : header.getSampleNamesInOrder()) {
        if (this.getUserSamples().contains(sample)) {
            if (!invert) {
                samples2.add(sample);
            }
        } else {
            if (invert) {
                samples2.add(sample);
            }
        }
    }
    final VCFHeader header2 = new VCFHeader(header.getMetaDataInInputOrder(), samples2);
    header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
    header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
    header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
    header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
    this.recalculator.setHeader(header2);
    out.writeHeader(header2);
    final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
    while (in.hasNext()) {
        final VariantContext ctx = progress.watch(in.next());
        final VariantContextBuilder vb = new VariantContextBuilder(ctx);
        final List<Genotype> genotypes = new ArrayList<Genotype>();
        final Set<Allele> alleles = new HashSet<Allele>();
        boolean only_no_call = true;
        for (final String sample : samples2) {
            final Genotype g = ctx.getGenotype(sample);
            if (g.isNoCall() || !g.isCalled())
                continue;
            alleles.addAll(g.getAlleles());
            genotypes.add(g);
            if (g.isCalled())
                only_no_call = false;
        }
        if (removeCtxIfNoCall && only_no_call)
            continue;
        alleles.add(ctx.getReference());
        vb.alleles(alleles);
        vb.genotypes(genotypes);
        out.add(this.recalculator.apply(vb.make()));
    }
    progress.finish();
    return 0;
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Example 78 with Genotype

use of htsjdk.variant.variantcontext.Genotype in project jvarkit by lindenb.

the class IndexCovToVcf method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.deletionTreshold >= 1.0f) {
        LOG.error("Bad deletion treshold >=1.0");
        return -1;
    }
    if (this.duplicationTreshold <= 1.0f) {
        LOG.error("Bad duplication treshold <=1.0");
        return -1;
    }
    if (this.deletionTreshold >= this.duplicationTreshold) {
        LOG.error("Bad tresholds del>=dup");
        return -1;
    }
    final Pattern tab = Pattern.compile("[\t]");
    BufferedReader r = null;
    VariantContextWriter vcw = null;
    try {
        final SAMSequenceDictionary dict;
        if (this.refFile == null) {
            dict = null;
        } else {
            dict = SAMSequenceDictionaryExtractor.extractDictionary(this.refFile);
            if (dict == null) {
                LOG.error("Cannot find dict in " + this.refFile);
                return -1;
            }
        }
        r = super.openBufferedReader(oneFileOrNull(args));
        String line = r.readLine();
        if (line == null) {
            LOG.error("Cannot read first line of input");
            return -1;
        }
        String[] tokens = tab.split(line);
        if (tokens.length < 4 || !tokens[0].equals("#chrom") || !tokens[1].equals("start") || !tokens[2].equals("end")) {
            LOG.error("bad first line " + line);
            return -1;
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, "GT");
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, "END");
        final VCFFormatHeaderLine foldHeader = new VCFFormatHeaderLine("F", 1, VCFHeaderLineType.Float, "Relative number of copy: 0.5 deletion 1 normal 2.0 duplication");
        metaData.add(foldHeader);
        final VCFFormatHeaderLine formatIsDeletion = new VCFFormatHeaderLine("DEL", 1, VCFHeaderLineType.Integer, "set to 1 if relative number of copy <= " + this.deletionTreshold);
        metaData.add(formatIsDeletion);
        final VCFFormatHeaderLine formatIsDuplication = new VCFFormatHeaderLine("DUP", 1, VCFHeaderLineType.Integer, "set to 1 if relative number of copy >= " + this.duplicationTreshold);
        metaData.add(formatIsDuplication);
        final VCFFilterHeaderLine filterAllDel = new VCFFilterHeaderLine("ALL_DEL", "number of samples greater than 1 and all are deletions");
        metaData.add(filterAllDel);
        final VCFFilterHeaderLine filterAllDup = new VCFFilterHeaderLine("ALL_DUP", "number of samples  greater than  1 and all are duplication");
        metaData.add(filterAllDup);
        final VCFFilterHeaderLine filterNoSV = new VCFFilterHeaderLine("NO_SV", "There is no DUP or DEL in this variant");
        metaData.add(filterNoSV);
        final VCFInfoHeaderLine infoNumDup = new VCFInfoHeaderLine("NDUP", 1, VCFHeaderLineType.Integer, "Number of samples being duplicated");
        metaData.add(infoNumDup);
        final VCFInfoHeaderLine infoNumDel = new VCFInfoHeaderLine("NDEL", 1, VCFHeaderLineType.Integer, "Number of samples being deleted");
        metaData.add(infoNumDel);
        final List<String> samples = Arrays.asList(tokens).subList(3, tokens.length);
        final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
        if (dict != null) {
            vcfHeader.setSequenceDictionary(dict);
        }
        vcw = super.openVariantContextWriter(outputFile);
        vcw.writeHeader(vcfHeader);
        // final List<Allele> NO_CALL_NO_CALL = Arrays.asList(Allele.NO_CALL,Allele.NO_CALL);
        final Allele DUP_ALLELE = Allele.create("<DUP>", false);
        final Allele DEL_ALLELE = Allele.create("<DEL>", false);
        final Allele REF_ALLELE = Allele.create("N", true);
        while ((line = r.readLine()) != null) {
            if (StringUtil.isBlank(line))
                continue;
            tokens = tab.split(line);
            if (tokens.length != 3 + samples.size()) {
                throw new JvarkitException.TokenErrors("expected " + (samples.size() + 3) + "columns.", tokens);
            }
            final Set<Allele> alleles = new HashSet<>();
            alleles.add(REF_ALLELE);
            final VariantContextBuilder vcb = new VariantContextBuilder();
            vcb.chr(tokens[0]);
            vcb.start(Integer.parseInt(tokens[1]));
            final int chromEnd = Integer.parseInt(tokens[2]);
            vcb.stop(chromEnd);
            vcb.attribute(VCFConstants.END_KEY, chromEnd);
            if (dict != null) {
                final SAMSequenceRecord ssr = dict.getSequence(tokens[0]);
                if (ssr == null) {
                    LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(tokens[0], dict));
                    return -1;
                }
                if (chromEnd > ssr.getSequenceLength()) {
                    LOG.warn("WARNING sequence length in " + line + " is greater than in dictionary ");
                }
            }
            int count_dup = 0;
            int count_del = 0;
            final List<Genotype> genotypes = new ArrayList<>(samples.size());
            for (int i = 3; i < tokens.length; i++) {
                final float f = Float.parseFloat(tokens[i]);
                if (f < 0 || Float.isNaN(f) || !Float.isFinite(f)) {
                    LOG.error("Bad fold " + f + " in " + line);
                }
                final GenotypeBuilder gb;
                if (f <= this.deletionTreshold) {
                    gb = new GenotypeBuilder(samples.get(i - 3), Collections.singletonList(DEL_ALLELE));
                    alleles.add(DEL_ALLELE);
                    gb.attribute(formatIsDeletion.getID(), 1);
                    count_del++;
                } else if (f >= this.duplicationTreshold) {
                    gb = new GenotypeBuilder(samples.get(i - 3), Collections.singletonList(DUP_ALLELE));
                    alleles.add(DUP_ALLELE);
                    gb.attribute(formatIsDuplication.getID(), 1);
                    count_dup++;
                } else {
                    gb = new GenotypeBuilder(samples.get(i - 3), Collections.singletonList(REF_ALLELE));
                    gb.attribute(formatIsDuplication.getID(), 0);
                }
                gb.attribute(foldHeader.getID(), f);
                genotypes.add(gb.make());
            }
            vcb.alleles(alleles);
            vcb.genotypes(genotypes);
            if (count_dup == samples.size() && samples.size() != 1) {
                vcb.filter(filterAllDup.getID());
            }
            if (count_del == samples.size() && samples.size() != 1) {
                vcb.filter(filterAllDel.getID());
            }
            if (count_dup == 0 && count_del == 0) {
                vcb.filter(filterNoSV.getID());
            }
            vcb.attribute(infoNumDel.getID(), count_del);
            vcb.attribute(infoNumDup.getID(), count_dup);
            vcw.add(vcb.make());
        }
        vcw.close();
        vcw = null;
        r.close();
        r = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
        CloserUtil.close(vcw);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Pattern(java.util.regex.Pattern) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader)

Example 79 with Genotype

use of htsjdk.variant.variantcontext.Genotype in project jvarkit by lindenb.

the class SamFindClippedRegions method doWork.

/*private static boolean closeTo(int pos1,int pos2, int max)
		{
		return Math.abs(pos2-pos1)<=max;
		}*/
/*
	private static boolean same(char c1,char c2)
		{
		if(c1=='N' || c2=='N') return false;
		return Character.toUpperCase(c1)==Character.toUpperCase(c2);
		}*/
@Override
public int doWork(List<String> args) {
    int readLength = 150;
    if (args.isEmpty()) {
        LOG.error("illegal.number.of.arguments");
        return -1;
    }
    List<Input> inputs = new ArrayList<Input>();
    VariantContextWriter w = null;
    // SAMFileWriter w=null;
    try {
        SAMSequenceDictionary dict = null;
        /* create input, collect sample names */
        Map<String, Input> sample2input = new HashMap<String, Input>();
        for (final String filename : args) {
            Input input = new Input(new File(filename));
            // input.index=inputs.size();
            inputs.add(input);
            if (sample2input.containsKey(input.sampleName)) {
                LOG.error("Duplicate sample " + input.sampleName + " in " + input.bamFile + " and " + sample2input.get(input.sampleName).bamFile);
                return -1;
            }
            sample2input.put(input.sampleName, input);
            if (dict == null) {
                dict = input.header.getSequenceDictionary();
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, input.header.getSequenceDictionary())) {
                LOG.error("Found more than one dictint sequence dictionary");
                return -1;
            }
        }
        LOG.info("Sample N= " + sample2input.size());
        /* create merged iterator */
        List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(sample2input.size());
        for (Input input : inputs) headers.add(input.header);
        SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SortOrder.coordinate, headers, true);
        List<SamReader> readers = new ArrayList<SamReader>(sample2input.size());
        for (Input input : inputs) readers.add(input.samFileReaderScan);
        MergingSamRecordIterator merginIter = new MergingSamRecordIterator(headerMerger, readers, true);
        Allele reference_allele = Allele.create("N", true);
        Allele[] alternate_alleles = new Allele[] { Allele.create("<CLIP5>", false), Allele.create("<CLIP3>", false) };
        Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
        for (Allele alt : alternate_alleles) {
            vcfHeaderLines.add(new VCFSimpleHeaderLine("<ID=" + alt.getDisplayString() + ",Description=\"StructVar\">", VCFHeaderVersion.VCF4_1, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")));
        }
        vcfHeaderLines.add(new VCFInfoHeaderLine("COUNT_SAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples with  depth>=" + this.min_depth));
        vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
        vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        for (int side = 0; side < 2; ++side) {
            vcfHeaderLines.add(new VCFFormatHeaderLine("CN" + (side == 0 ? 5 : 3), 1, VCFHeaderLineType.Integer, "count clipped in " + (side == 0 ? 5 : 3) + "'"));
        }
        if (dict != null) {
            vcfHeaderLines.addAll(VCFUtils.samSequenceDictToVCFContigHeaderLine(dict));
        }
        VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, sample2input.keySet());
        w = VCFUtils.createVariantContextWriterToStdout();
        w.writeHeader(vcfHeader);
        final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<>();
        // w=swf.make(header, System.out);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        if (bedFile != null) {
            final BedLineCodec bedLineCodec = new BedLineCodec();
            LOG.info("Reading " + bedFile);
            BufferedReader r = IOUtils.openFileForBufferedReading(bedFile);
            String line;
            while ((line = r.readLine()) != null) {
                BedLine bedLine = bedLineCodec.decode(line);
                if (bedLine == null)
                    continue;
                if (dict != null && dict.getSequence(bedLine.getContig()) == null) {
                    LOG.warning("undefined chromosome  in " + bedFile + " " + line);
                    continue;
                }
                intervals.put(bedLine.toInterval(), true);
            }
            CloserUtil.close(r);
        }
        LinkedList<SAMRecord> buffer = new LinkedList<SAMRecord>();
        final Predicate<SAMRecord> filterSamRecords = new Predicate<SAMRecord>() {

            @Override
            public boolean test(SAMRecord rec) {
                if (rec.getReadUnmappedFlag())
                    return false;
                if (rec.isSecondaryOrSupplementary())
                    return false;
                if (rec.getDuplicateReadFlag())
                    return false;
                if (rec.getReadFailsVendorQualityCheckFlag())
                    return false;
                Cigar cigar = rec.getCigar();
                if (cigar == null || cigar.numCigarElements() < 2)
                    return false;
                boolean found_S = false;
                for (int side = 0; side < 2; ++side) {
                    CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
                    // read must be clipped on 5' or 3' with a good length
                    if (!ce.getOperator().equals(CigarOperator.S))
                        continue;
                    found_S = true;
                    break;
                }
                if (!found_S)
                    return false;
                SAMReadGroupRecord g = rec.getReadGroup();
                if (g == null || g.getSample() == null || g.getSample().isEmpty())
                    return false;
                return true;
            }
        };
        final FilteringIterator<SAMRecord> forwardIterator = new FilteringIterator<SAMRecord>(merginIter, filterSamRecords);
        for (; ; ) {
            SAMRecord rec = null;
            if (forwardIterator.hasNext()) {
                rec = forwardIterator.next();
                progress.watch(rec);
                if (intervals != null && !intervals.containsOverlapping(new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd())))
                    continue;
            }
            // need to flush buffer ?
            if (rec == null || (!buffer.isEmpty() && !buffer.getLast().getReferenceIndex().equals(rec.getReferenceIndex())) || (!buffer.isEmpty() && buffer.getLast().getUnclippedEnd() + readLength < rec.getUnclippedStart())) {
                if (!buffer.isEmpty()) {
                    int chromStart = buffer.getFirst().getUnclippedStart();
                    int chromEnd = buffer.getFirst().getUnclippedEnd();
                    for (SAMRecord sam : buffer) {
                        chromStart = Math.min(chromStart, sam.getUnclippedStart());
                        chromEnd = Math.max(chromEnd, sam.getUnclippedEnd());
                    }
                    final int winShift = 5;
                    for (int pos = chromStart; pos + winShift <= chromEnd; pos += winShift) {
                        int[] count_big_clip = new int[] { 0, 0 };
                        // int max_depth[]=new int[]{0,0};
                        List<Genotype> genotypes = new ArrayList<Genotype>();
                        Set<Allele> all_alleles = new HashSet<Allele>();
                        all_alleles.add(reference_allele);
                        boolean found_one_depth_ok = false;
                        int sum_depth = 0;
                        int samples_with_high_depth = 0;
                        for (String sample : sample2input.keySet()) {
                            GenotypeBuilder gb = new GenotypeBuilder(sample);
                            int[] count_clipped = new int[] { 0, 0 };
                            Set<Allele> sample_alleles = new HashSet<Allele>(3);
                            for (int side = 0; side < 2; ++side) {
                                for (SAMRecord sam : buffer) {
                                    if (!sam.getReadGroup().getSample().equals(sample))
                                        continue;
                                    Cigar cigar = sam.getCigar();
                                    CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
                                    if (!ce.getOperator().equals(CigarOperator.S))
                                        continue;
                                    int clipStart = (side == 0 ? sam.getUnclippedStart() : sam.getAlignmentEnd() + 1);
                                    int clipEnd = (side == 0 ? sam.getAlignmentStart() - 1 : sam.getUnclippedEnd());
                                    if ((pos + winShift < clipStart || pos > clipEnd))
                                        continue;
                                    count_clipped[side]++;
                                    if (ce.getLength() >= this.min_clip_length) {
                                        count_big_clip[side]++;
                                    }
                                    sample_alleles.add(alternate_alleles[side]);
                                    gb.attribute("CN" + (side == 0 ? 5 : 3), count_clipped[side]);
                                }
                            }
                            // if(!(found_one_big_clip[0] || found_one_big_clip[1])) continue;
                            if (count_clipped[0] + count_clipped[1] == 0)
                                continue;
                            if ((count_clipped[0] + count_clipped[1]) > min_depth) {
                                found_one_depth_ok = true;
                                ++samples_with_high_depth;
                            }
                            sum_depth += (count_clipped[0] + count_clipped[1]);
                            gb.alleles(new ArrayList<Allele>(sample_alleles));
                            all_alleles.addAll(sample_alleles);
                            gb.DP(count_clipped[0] + count_clipped[1]);
                            genotypes.add(gb.make());
                        }
                        if (all_alleles.size() == 1) {
                            // all homozygotes
                            continue;
                        }
                        if (!found_one_depth_ok) {
                            continue;
                        }
                        if (!(count_big_clip[0] >= 1 || count_big_clip[1] >= 1)) {
                            continue;
                        }
                        Map<String, Object> atts = new HashMap<String, Object>();
                        atts.put("COUNT_SAMPLES", samples_with_high_depth);
                        atts.put(VCFConstants.DEPTH_KEY, sum_depth);
                        VariantContextBuilder vcb = new VariantContextBuilder();
                        vcb.chr(buffer.getFirst().getReferenceName());
                        vcb.start(pos);
                        vcb.stop(pos + winShift);
                        vcb.alleles(all_alleles);
                        vcb.attributes(atts);
                        vcb.genotypes(genotypes);
                        w.add(vcb.make());
                    }
                    buffer.clear();
                }
                if (rec == null) {
                    break;
                }
            }
            buffer.add(rec);
        }
        merginIter.close();
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        for (Input input : inputs) {
            CloserUtil.close(input);
        }
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) VCFSimpleHeaderLine(htsjdk.variant.vcf.VCFSimpleHeaderLine) Predicate(java.util.function.Predicate) HashSet(java.util.HashSet) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarElement(htsjdk.samtools.CigarElement) LinkedList(java.util.LinkedList) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File) Interval(htsjdk.samtools.util.Interval) MergingSamRecordIterator(htsjdk.samtools.MergingSamRecordIterator) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SamFileHeaderMerger(htsjdk.samtools.SamFileHeaderMerger) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap)

Example 80 with Genotype

use of htsjdk.variant.variantcontext.Genotype in project jvarkit by lindenb.

the class VcfSimulator method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.indexedFastaSequenceFile == null) {
        LOG.error("Reference is undefined");
        return -1;
    }
    if (!args.isEmpty()) {
        LOG.error("too many arguments");
        return -1;
    }
    if (this.randomSeed == -1L) {
        this.random = new Random(System.currentTimeMillis());
    } else {
        this.random = new Random(this.randomSeed);
    }
    if (this.numSamples < 0) {
        this.numSamples = 1 + this.random.nextInt(10);
    }
    while (this.samples.size() < numSamples) this.samples.add("SAMPLE" + (1 + this.samples.size()));
    VariantContextWriter writer = null;
    PrintStream pw = null;
    try {
        final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, "GT", "DP");
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, "AF", "AN", "AC", "DP");
        VariantAttributesRecalculator calc = new VariantAttributesRecalculator();
        final VCFHeader header = new VCFHeader(metaData, this.samples);
        header.setSequenceDictionary(this.indexedFastaSequenceFile.getSequenceDictionary());
        calc.setHeader(header);
        pw = super.openFileOrStdoutAsPrintStream(this.outputFile);
        writer = VCFUtils.createVariantContextWriterToOutputStream(pw);
        writer.writeHeader(header);
        long countVariantsSoFar = 0;
        for (; ; ) {
            if (pw.checkError())
                break;
            if (this.numberOfVariants >= 0 && countVariantsSoFar >= this.numberOfVariants)
                break;
            for (final SAMSequenceRecord ssr : this.indexedFastaSequenceFile.getSequenceDictionary().getSequences()) {
                if (pw.checkError())
                    break;
                final GenomicSequence genomic = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
                for (int pos = 1; pos <= ssr.getSequenceLength(); ++pos) {
                    if (pw.checkError())
                        break;
                    char REF = Character.toUpperCase(genomic.charAt(pos - 1));
                    if (REF == 'N')
                        continue;
                    char ALT = 'N';
                    switch(REF) {
                        case 'A':
                            ALT = "TGC".charAt(random.nextInt(3));
                            break;
                        case 'T':
                            ALT = "AGC".charAt(random.nextInt(3));
                            break;
                        case 'G':
                            ALT = "TAC".charAt(random.nextInt(3));
                            break;
                        case 'C':
                            ALT = "TGA".charAt(random.nextInt(3));
                            break;
                        default:
                            ALT = 'N';
                    }
                    if (ALT == 'N')
                        continue;
                    final Allele refAllele = Allele.create((byte) REF, true);
                    Allele altAllele = Allele.create((byte) ALT, false);
                    final VariantContextBuilder cb = new VariantContextBuilder();
                    cb.chr(genomic.getChrom());
                    cb.start(pos);
                    cb.stop(pos);
                    List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
                    for (String sample : samples) {
                        final Allele a1 = (random.nextBoolean() ? refAllele : altAllele);
                        final Allele a2 = (random.nextBoolean() ? refAllele : altAllele);
                        GenotypeBuilder gb = new GenotypeBuilder(sample, Arrays.asList(a1, a2));
                        if (random.nextBoolean()) {
                            gb = new GenotypeBuilder(sample, Arrays.asList(a1, a2));
                            gb.DP(1 + random.nextInt(50));
                        }
                        genotypes.add(gb.make());
                    }
                    cb.genotypes(genotypes);
                    cb.alleles(Arrays.asList(refAllele, altAllele));
                    writer.add(calc.apply(cb.make()));
                    countVariantsSoFar++;
                }
            }
        }
        writer.close();
        writer = null;
        pw.flush();
        pw.close();
        pw = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        CloserUtil.close(pw);
        CloserUtil.close(writer);
    }
}
Also used : PrintStream(java.io.PrintStream) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ArrayList(java.util.ArrayList) Genotype(htsjdk.variant.variantcontext.Genotype) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Allele(htsjdk.variant.variantcontext.Allele) Random(java.util.Random) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VariantAttributesRecalculator(com.github.lindenb.jvarkit.util.vcf.VariantAttributesRecalculator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Aggregations

Genotype (htsjdk.variant.variantcontext.Genotype)150 VariantContext (htsjdk.variant.variantcontext.VariantContext)97 Allele (htsjdk.variant.variantcontext.Allele)82 ArrayList (java.util.ArrayList)54 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)52 GenotypeBuilder (htsjdk.variant.variantcontext.GenotypeBuilder)51 File (java.io.File)48 VCFHeader (htsjdk.variant.vcf.VCFHeader)46 IOException (java.io.IOException)45 Collectors (java.util.stream.Collectors)42 Test (org.testng.annotations.Test)37 HashSet (java.util.HashSet)35 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)29 List (java.util.List)29 VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine)27 VCFFormatHeaderLine (htsjdk.variant.vcf.VCFFormatHeaderLine)25 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)23 HashMap (java.util.HashMap)23 java.util (java.util)22 Set (java.util.Set)22