Use of htsjdk.variant.variantcontext.GenotypeBuilder in project gatk by broadinstitute: class DepthPerAlleleBySample, method annotate.
@Override
public void annotate(final ReferenceContext ref, final VariantContext vc, final Genotype g, final GenotypeBuilder gb, final ReadLikelihoods<Allele> likelihoods) {
Utils.nonNull(gb, "gb is null");
Utils.nonNull(vc, "vc is null");
if (g == null || !g.isCalled() || likelihoods == null) {
return;
}
final Set<Allele> alleles = new LinkedHashSet<>(vc.getAlleles());
// make sure that there's a meaningful relationship between the alleles in the likelihoods and our VariantContext
Utils.validateArg(likelihoods.alleles().containsAll(alleles), () -> "VC alleles " + alleles + " not a subset of ReadLikelihoods alleles " + likelihoods.alleles());
final Map<Allele, Integer> alleleCounts = new LinkedHashMap<>();
for (final Allele allele : vc.getAlleles()) {
alleleCounts.put(allele, 0);
}
final Map<Allele, List<Allele>> alleleSubset = alleles.stream().collect(Collectors.toMap(a -> a, Arrays::asList));
final ReadLikelihoods<Allele> subsettedLikelihoods = likelihoods.marginalize(alleleSubset);
subsettedLikelihoods.bestAlleles(g.getSampleName()).stream().filter(ba -> ba.isInformative()).forEach(ba -> alleleCounts.compute(ba.allele, (allele, prevCount) -> prevCount + 1));
final int[] counts = new int[alleleCounts.size()];
// the first entry in AD is always the reference allele
counts[0] = alleleCounts.get(vc.getReference());
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i));
}
gb.AD(counts);
}
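The method above fills the AD field by counting each read's best allele, then writing the array through GenotypeBuilder.AD(int[]). A minimal self-contained sketch of that final builder step, with invented alleles and counts rather than GATK test data:
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import java.util.Arrays;
public class AdExample {
    public static void main(String[] args) {
        final Allele ref = Allele.create("A", true);   // reference allele
        final Allele alt = Allele.create("T", false);  // alternate allele
        // AD order mirrors the VariantContext allele order: ref first, then alts
        final Genotype g = new GenotypeBuilder("sample1", Arrays.asList(ref, alt))
                .AD(new int[] { 12, 7 })  // 12 ref-supporting reads, 7 alt-supporting
                .make();
        System.out.println(g.getAD()[0] + " ref reads, " + g.getAD()[1] + " alt reads");
    }
}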
Use of htsjdk.variant.variantcontext.GenotypeBuilder in project gatk-protected by broadinstitute: class StrandArtifact, method annotate.
@Override
public void annotate(final ReferenceContext ref, final VariantContext vc, final Genotype g, final GenotypeBuilder gb, final ReadLikelihoods<Allele> likelihoods) {
Utils.nonNull(gb);
Utils.nonNull(vc);
Utils.nonNull(likelihoods);
// do not annotate the genotype fields for the normal sample
if (g.isHomRef()) {
return;
}
pi.put(NO_ARTIFACT, 0.95);
pi.put(ART_FWD, 0.025);
pi.put(ART_REV, 0.025);
// We use the allele with the highest LOD score
final double[] tumorLods = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(vc, GATKVCFConstants.TUMOR_LOD_KEY, () -> null, -1);
final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods);
final Allele altAllele = vc.getAlternateAllele(indexOfMaxTumorLod);
final Collection<ReadLikelihoods<Allele>.BestAllele> bestAlleles = likelihoods.bestAlleles(g.getSampleName());
final int numFwdAltReads = (int) bestAlleles.stream().filter(ba -> !ba.read.isReverseStrand() && ba.isInformative() && ba.allele.equals(altAllele)).count();
final int numRevAltReads = (int) bestAlleles.stream().filter(ba -> ba.read.isReverseStrand() && ba.isInformative() && ba.allele.equals(altAllele)).count();
final int numFwdReads = (int) bestAlleles.stream().filter(ba -> !ba.read.isReverseStrand() && ba.isInformative()).count();
final int numRevReads = (int) bestAlleles.stream().filter(ba -> ba.read.isReverseStrand() && ba.isInformative()).count();
final int numAltReads = numFwdAltReads + numRevAltReads;
final int numReads = numFwdReads + numRevReads;
final EnumMap<StrandArtifactZ, Double> unnormalized_posterior_probabilities = new EnumMap<>(StrandArtifactZ.class);
final EnumMap<StrandArtifactZ, Double> maximum_a_posteriori_allele_fraction_estimates = new EnumMap<>(StrandArtifactZ.class);
/*** Compute the posterior probability of ARTIFACT_FWD and ARTIFACT_REV; it's a double integral over f and epsilon ***/
// the integrand is a polynomial of degree n, where n is the number of reads at the locus
// thus to integrate exactly with Gauss-Legendre we need (n/2)+1 points
final int numIntegPointsForAlleleFraction = numReads / 2 + 1;
final int numIntegPointsForEpsilon = (numReads + ALPHA + BETA - 2) / 2 + 1;
final double likelihoodForArtifactFwd = IntegrationUtils.integrate2d((f, epsilon) -> getIntegrandGivenArtifact(f, epsilon, numFwdReads, numRevReads, numFwdAltReads, numRevAltReads), 0.0, 1.0, numIntegPointsForAlleleFraction, 0.0, 1.0, numIntegPointsForEpsilon);
final double likelihoodForArtifactRev = IntegrationUtils.integrate2d((f, epsilon) -> getIntegrandGivenArtifact(f, epsilon, numRevReads, numFwdReads, numRevAltReads, numFwdAltReads), 0.0, 1.0, numIntegPointsForAlleleFraction, 0.0, 1.0, numIntegPointsForEpsilon);
unnormalized_posterior_probabilities.put(ART_FWD, pi.get(ART_FWD) * likelihoodForArtifactFwd);
unnormalized_posterior_probabilities.put(ART_REV, pi.get(ART_REV) * likelihoodForArtifactRev);
/*** Compute the posterior probability of NO_ARTIFACT; evaluate a single integral over the allele fraction ***/
final double likelihoodForNoArtifact = IntegrationUtils.integrate(f -> getIntegrandGivenNoArtifact(f, numFwdReads, numRevReads, numFwdAltReads, numRevAltReads), 0.0, 1.0, numIntegPointsForAlleleFraction);
unnormalized_posterior_probabilities.put(NO_ARTIFACT, pi.get(NO_ARTIFACT) * likelihoodForNoArtifact);
final double[] posterior_probabilities = MathUtils.normalizeFromRealSpace(unnormalized_posterior_probabilities.values().stream().mapToDouble(Double::doubleValue).toArray());
/*** Compute the maximum a posteriori estimate for allele fraction given strand artifact ***/
// For a fixed f, integrate the double integral over epsilons. This gives us the likelihood p(x^+, x^- | f, z) for a fixed f, which is proportional to
// the posterior probability of p(f | x^+, x^-, z)
final int numSamplePoints = 100;
final double[] samplePoints = GATKProtectedMathUtils.createEvenlySpacedPoints(0.0, 1.0, numSamplePoints);
double[] likelihoodsGivenForwardArtifact = new double[numSamplePoints];
double[] likelihoodsGivenReverseArtifact = new double[numSamplePoints];
for (int i = 0; i < samplePoints.length; i++) {
final double f = samplePoints[i];
likelihoodsGivenForwardArtifact[i] = IntegrationUtils.integrate(epsilon -> getIntegrandGivenArtifact(f, epsilon, numFwdReads, numRevReads, numFwdAltReads, numRevAltReads), 0.0, 1.0, numIntegPointsForEpsilon);
likelihoodsGivenReverseArtifact[i] = IntegrationUtils.integrate(epsilon -> getIntegrandGivenArtifact(f, epsilon, numRevReads, numFwdReads, numRevAltReads, numFwdAltReads), 0.0, 1.0, numIntegPointsForEpsilon);
}
final int maxAlleleFractionIndexFwd = MathUtils.maxElementIndex(likelihoodsGivenForwardArtifact);
final int maxAlleleFractionIndexRev = MathUtils.maxElementIndex(likelihoodsGivenReverseArtifact);
maximum_a_posteriori_allele_fraction_estimates.put(ART_FWD, samplePoints[maxAlleleFractionIndexFwd]);
maximum_a_posteriori_allele_fraction_estimates.put(ART_REV, samplePoints[maxAlleleFractionIndexRev]);
// In the absence of strand artifact, MAP estimate for f reduces to the sample alt allele fraction
maximum_a_posteriori_allele_fraction_estimates.put(NO_ARTIFACT, (double) numAltReads / numReads);
gb.attribute(POSTERIOR_PROBABILITIES_KEY, posterior_probabilities);
gb.attribute(MAP_ALLELE_FRACTIONS_KEY, maximum_a_posteriori_allele_fraction_estimates.values().stream().mapToDouble(Double::doubleValue).toArray());
}
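The last two lines store per-genotype arrays through gb.attribute(key, value); htsjdk serializes a double[] FORMAT value as a comma-separated list. A minimal sketch of that step in isolation — the key strings are assumed to be the values of POSTERIOR_PROBABILITIES_KEY and MAP_ALLELE_FRACTIONS_KEY, and the numbers are invented:
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import java.util.Arrays;
public class StrandAttrExample {
    public static void main(String[] args) {
        final Allele ref = Allele.create("G", true);
        final Allele alt = Allele.create("C", false);
        final GenotypeBuilder gb = new GenotypeBuilder("tumor", Arrays.asList(ref, alt));
        // hypothetical posteriors for {NO_ARTIFACT, ART_FWD, ART_REV}
        gb.attribute("SA_POST_PROB", new double[] { 0.90, 0.07, 0.03 });
        gb.attribute("SA_MAP_AF", new double[] { 0.45, 0.42, 0.44 });
        final Genotype g = gb.make();
        System.out.println(g.getExtendedAttribute("SA_POST_PROB"));
    }
}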
Use of htsjdk.variant.variantcontext.GenotypeBuilder in project jvarkit by lindenb: class FastGenotypeGVCFs, method doWork.
@Override
public int doWork(final List<String> args) {
VariantContextWriter w = null;
try {
this.gvcfSources.addAll(IOUtil.unrollFiles(args.stream().map(S -> new File(S)).collect(Collectors.toSet()), ".g.vcf", ".g.vcf.gz").stream().map(F -> new Source(F)).collect(Collectors.toList()));
if (args.isEmpty()) {
LOG.error("No gvcf file was given");
return -1;
}
this.gvcfSources.stream().forEach(S -> S.open());
this.dictionary = this.gvcfSources.get(0).vcfFileReader.getFileHeader().getSequenceDictionary();
if (this.dictionary == null) {
LOG.error("Dict missing in " + this.gvcfSources.get(0).gvcfFile);
return -1;
}
this.gvcfSources.stream().map(S -> S.vcfFileReader.getFileHeader().getSequenceDictionary()).forEach(D -> {
if (D == null || !SequenceUtil.areSequenceDictionariesEqual(D, dictionary)) {
throw new JvarkitException.UserError("dict missing or dict are not the same");
}
});
if (gvcfSources.stream().map(S -> S.sampleName).distinct().count() != this.gvcfSources.size()) {
LOG.error("Duplicate sample name. check input");
return -1;
}
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.GENOTYPE_PL_KEY);
metaData.addAll(gvcfSources.stream().flatMap(S -> S.vcfFileReader.getFileHeader().getFormatHeaderLines().stream()).collect(Collectors.toSet()));
final VCFHeader header = new VCFHeader(metaData, gvcfSources.stream().map(S -> S.sampleName).collect(Collectors.toList()));
w = super.openVariantContextWriter(outputFile);
w.writeHeader(header);
int contigTid = 0;
while (contigTid < dictionary.size()) {
final SAMSequenceRecord ssr = this.dictionary.getSequence(contigTid);
int pos = 0;
while (pos < ssr.getSequenceLength()) {
// LOG.debug(ssr.getSequenceName()+" "+pos+" "+this.gvcfSources.size());
GvcfVariant variantAtThisPos = null;
int minEnd = ssr.getSequenceLength();
// cleanup
for (final Source src : this.gvcfSources) {
for (; ; ) {
final GvcfThing gvcfthing = src.get();
// LOG.debug(""+gvcfthing+" "+src.sampleName+" "+ssr.getSequenceName()+":"+pos);
if (gvcfthing == null) {
// no more variants available
break;
} else if (contigTid > gvcfthing.getTid()) {
// observed contig is after gvcfthing.contig
src.current = null;
continue;
} else if (contigTid < gvcfthing.getTid()) {
// observed contig is before gvcfthing.contig
break;
} else if (gvcfthing.getEnd() < pos) {
// variant information is before observed pos
src.current = null;
continue;
} else if (gvcfthing.getStart() > pos) {
// variant information is after observed pos
minEnd = Math.min(minEnd, gvcfthing.getStart() - 1);
break;
} else if (gvcfthing.isVariant()) {
if (variantAtThisPos == null || variantContigPosRefComparator.compare(GvcfVariant.class.cast(gvcfthing).ctx, variantAtThisPos.ctx) < 0) {
variantAtThisPos = GvcfVariant.class.cast(gvcfthing);
}
break;
} else if (gvcfthing.isBlock()) {
minEnd = Math.min(minEnd, gvcfthing.getEnd());
break;
} else {
LOG.debug("??");
}
}
}
if (variantAtThisPos == null) {
pos = minEnd + 1;
} else {
final VariantContext archetype = variantAtThisPos.ctx;
final List<VariantContext> allVariants = this.gvcfSources.stream().map(S -> S.get()).filter(G -> G != null && G.isVariant()).map(G -> GvcfVariant.class.cast(G).ctx).filter(V -> variantContigPosRefComparator.compare(V, archetype) == 0).collect(Collectors.toList());
final Set<Allele> alleles = allVariants.stream().flatMap(V -> V.getGenotypes().stream()).flatMap(G -> G.getAlleles().stream()).filter(A -> !(A.equals(NON_REF) || A.isNoCall())).collect(Collectors.toSet());
alleles.add(archetype.getReference());
final VariantContextBuilder vcb = new VariantContextBuilder(getClass().getName(), archetype.getContig(), archetype.getStart(), archetype.getEnd(), alleles);
if (archetype.hasID()) {
vcb.id(archetype.getID());
}
final List<Genotype> genotypes = new ArrayList<>(allVariants.size());
for (VariantContext ctx : allVariants) {
Genotype genotype = ctx.getGenotype(0);
GenotypeBuilder gb = new GenotypeBuilder(genotype);
genotypes.add(gb.make());
}
vcb.genotypes(genotypes);
final VariantContext genotypedVariant;
try {
genotypedVariant = vcb.make();
} catch (Exception err2) {
LOG.debug(allVariants);
LOG.error(err2);
return -1;
}
w.add(genotypedVariant);
// reset source for the current variant
for (final Source src : this.gvcfSources) {
if (src.current != null && variantContigPosRefComparator.compare(src.current.ctx, archetype) == 0) {
src.current = null;
}
}
}
}
++contigTid;
}
w.close();
w = null;
this.gvcfSources.stream().forEach(S -> S.close());
this.gvcfSources.clear();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
for (final Source src : this.gvcfSources) src.close();
CloserUtil.close(w);
}
}
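The genotype loop near the end relies on GenotypeBuilder's copy constructor: it clones each source genotype and make()s a new immutable instance. A minimal sketch of the copy-then-override idiom, with invented sample data:
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import java.util.Arrays;
public class CopyGenotypeExample {
    public static void main(String[] args) {
        final Allele ref = Allele.create("T", true);
        final Allele alt = Allele.create("C", false);
        final Genotype original = new GenotypeBuilder("s1", Arrays.asList(ref, alt))
                .DP(30).GQ(99).make();
        // copy every field from 'original', then override a single one
        final Genotype copy = new GenotypeBuilder(original).DP(31).make();
        System.out.println(copy.getDP() + " vs " + original.getDP());
    }
}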
Use of htsjdk.variant.variantcontext.GenotypeBuilder in project jvarkit by lindenb: class VcfMultiToOne, method doWork.
@Override
public int doWork(final List<String> arguments) {
VariantContextWriter out = null;
Set<String> args = IOUtils.unrollFiles(arguments);
List<VcfIterator> inputs = new ArrayList<>(args.size() + 1);
List<String> inputFiles = new ArrayList<>(args.size() + 1);
try {
if (args.isEmpty() && arguments.isEmpty()) {
inputs.add(VCFUtils.createVcfIteratorStdin());
inputFiles.add("stdin");
} else if (args.isEmpty()) {
LOG.error("No vcf provided");
return -1;
} else {
for (final String vcfFile : args) {
inputs.add(VCFUtils.createVcfIterator(vcfFile));
inputFiles.add(VCFUtils.escapeInfoField(vcfFile));
}
}
SAMSequenceDictionary dict = null;
final Set<String> sampleNames = new HashSet<String>();
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
for (final VcfIterator in : inputs) {
final VCFHeader header = in.getHeader();
if (dict == null) {
dict = header.getSequenceDictionary();
} else if (header.getSequenceDictionary() == null) {
LOG.error("No Dictionary in vcf");
return -1;
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, header.getSequenceDictionary())) {
LOG.error("Not the same dictionary between vcfs");
return -1;
}
metaData.addAll(in.getHeader().getMetaDataInInputOrder());
sampleNames.addAll(in.getHeader().getSampleNamesInOrder());
}
final Comparator<VariantContext> comparator = (dict == null ? VCFUtils.createChromPosRefComparator() : VCFUtils.createTidPosRefComparator(dict));
// addMetaData(metaData);
metaData.add(new VCFInfoHeaderLine(DEFAULT_SAMPLE_TAGID, 1, VCFHeaderLineType.String, "Sample Name from multi-sample vcf"));
metaData.add(new VCFInfoHeaderLine(DEFAULT_SAMPLE_FILETAGID, 1, VCFHeaderLineType.String, "Origin of sample"));
for (final String sample : sampleNames) {
metaData.add(new VCFHeaderLine(SAMPLE_HEADER_DECLARATION, sample));
}
final VCFHeader h2 = new VCFHeader(metaData, Collections.singleton(DEFAULT_VCF_SAMPLE_NAME));
recalculator.setHeader(h2);
out = super.openVariantContextWriter(this.outputFile);
out.writeHeader(h2);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
for (; ; ) {
if (out.checkError())
break;
/* get 'smallest' variant */
VariantContext smallest = null;
int idx = 0;
int best_idx = -1;
while (idx < inputs.size()) {
final VcfIterator in = inputs.get(idx);
if (!in.hasNext()) {
CloserUtil.close(in);
inputs.remove(idx);
inputFiles.remove(idx);
} else {
final VariantContext ctx = in.peek();
if (smallest == null || comparator.compare(smallest, ctx) > 0) {
smallest = ctx;
best_idx = idx;
}
++idx;
}
}
if (smallest == null)
break;
final VariantContext ctx = progress.watch(inputs.get(best_idx).next());
if (ctx.getNSamples() == 0) {
if (!this.discard_no_call) {
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.attribute(DEFAULT_SAMPLE_FILETAGID, inputFiles.get(best_idx));
vcb.genotypes(GenotypeBuilder.createMissing(DEFAULT_VCF_SAMPLE_NAME, 2));
out.add(this.recalculator.apply(vcb.make()));
}
continue;
}
for (int i = 0; i < ctx.getNSamples(); ++i) {
final Genotype g = ctx.getGenotype(i);
final String sample = g.getSampleName();
if (!g.isCalled() && this.discard_no_call)
continue;
if (!g.isAvailable() && this.discard_non_available)
continue;
if (g.isHomRef() && this.discard_hom_ref)
continue;
final GenotypeBuilder gb = new GenotypeBuilder(g);
gb.name(DEFAULT_VCF_SAMPLE_NAME);
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.attribute(DEFAULT_SAMPLE_TAGID, sample);
vcb.attribute(DEFAULT_SAMPLE_FILETAGID, inputFiles.get(best_idx));
vcb.genotypes(gb.make());
out.add(this.recalculator.apply(vcb.make()));
}
}
progress.finish();
LOG.debug("done");
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(inputs);
CloserUtil.close(out);
}
}
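Two GenotypeBuilder features carry this tool: the static createMissing(sample, ploidy) factory, which produces a diploid ./. placeholder when an input VCF has no samples, and name(...), which relabels an existing genotype under the single output sample. A minimal sketch of both, with invented names:
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import java.util.Arrays;
public class MissingAndRenameExample {
    public static void main(String[] args) {
        // a diploid no-call placeholder, as used when the input VCF has no samples
        final Genotype missing = GenotypeBuilder.createMissing("SAMPLE", 2);
        System.out.println(missing.getSampleName() + " " + missing.getGenotypeString()); // SAMPLE ./.
        // relabel an existing genotype under a new sample name
        final Allele ref = Allele.create("A", true);
        final Genotype g = new GenotypeBuilder("originalName", Arrays.asList(ref, ref)).make();
        final Genotype renamed = new GenotypeBuilder(g).name("SAMPLE").make();
        System.out.println(renamed.getSampleName());
    }
}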
Use of htsjdk.variant.variantcontext.GenotypeBuilder in project jvarkit by lindenb: class SoftClipAnnotator, method map.
public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
if (tracker == null)
return 0;
final Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
if (VCs.isEmpty()) {
return 0;
}
final Collection<ReadFilter> readFilters = super.getToolkit().getFilters();
for (final VariantContext ctx : VCs) {
int observed_genotypes = 0;
int num_some_clipped_genotypes = 0;
final List<Genotype> genotypes = new ArrayList<>(ctx.getNSamples());
for (int i = 0; i < ctx.getNSamples(); ++i) {
final Genotype g = ctx.getGenotype(i);
if (!g.isCalled() || g.isNoCall() || g.isHomRef()) {
genotypes.add(g);
continue;
}
final List<SamReader> bamReaderForSample = this.sample2bam.get(g.getSampleName());
if (bamReaderForSample.isEmpty()) {
genotypes.add(g);
continue;
}
observed_genotypes++;
int numberOfClipsOverlapping = 0;
for (final SamReader samReader : bamReaderForSample) {
final SAMRecordIterator iter = samReader.query(ctx.getContig(), Math.max(0, ctx.getStart() - extend4clip), ctx.getEnd() + extend4clip, false);
while (iter.hasNext()) {
final SAMRecord samRecord = iter.next();
if (samRecord.getReadUnmappedFlag() || samRecord.getCigar() == null || !samRecord.getContig().equals(ctx.getContig()) || samRecord.getUnclippedEnd() < ctx.getStart() || samRecord.getUnclippedStart() > ctx.getEnd() || samRecord.getReadGroup() == null || !g.getSampleName().equals(samRecord.getReadGroup().getSample()))
continue;
boolean ok = true;
for (final ReadFilter readFilter : readFilters) {
// this one can't be correctly initialized...
if (readFilter instanceof MalformedReadFilter)
continue;
if (readFilter.filterOut(samRecord)) {
ok = false;
break;
}
}
if (!ok)
continue;
int refPos = samRecord.getUnclippedStart();
for (final CigarElement ce : samRecord.getCigar()) {
if (ce.getOperator().consumesReferenceBases() || ce.getOperator().isClipping()) {
final int refEnd = refPos + ce.getLength();
if (!(ctx.getEnd() < refPos || refEnd < ctx.getStart())) {
// System.err.println("IN "+ce+" "+ctx.getStart()+"-"+ctx.getEnd()+" : "+refPos+"-"+refPos+ce.getLength());
if (ce.getOperator().equals(CigarOperator.S)) {
++numberOfClipsOverlapping;
}
}
refPos += ce.getLength();
}
if (refPos > ctx.getEnd())
break;
}
}
iter.close();
}
if (numberOfClipsOverlapping == 0) {
genotypes.add(g);
} else {
GenotypeBuilder gb = new GenotypeBuilder(g);
gb.attribute(numClipInGenotypeFormatHeaderLine.getID(), numberOfClipsOverlapping);
genotypes.add(gb.make());
num_some_clipped_genotypes++;
}
}
if (num_some_clipped_genotypes == 0) {
vcfWriter.add(ctx);
} else {
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.genotypes(genotypes);
if (observed_genotypes == num_some_clipped_genotypes) {
vcb.filter(this.filterHaveClipInVariant.getID());
}
vcfWriter.add(vcb.make());
}
}
return VCs.size();
}
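The else branch shows the remaining common idiom: copy a genotype and attach a custom FORMAT attribute, here the overlapping-clip count. A minimal sketch with a hypothetical key "CLIP"; a real writer would declare a matching VCFFormatHeaderLine before emitting it:
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import java.util.Arrays;
public class ClipAttrExample {
    public static void main(String[] args) {
        final Allele ref = Allele.create("C", true);
        final Allele alt = Allele.create("G", false);
        final Genotype g = new GenotypeBuilder("s1", Arrays.asList(ref, alt)).make();
        final int numberOfClipsOverlapping = 4; // invented count
        // attach the count under a custom FORMAT key; declare it in the header as e.g.
        // new VCFFormatHeaderLine("CLIP", 1, VCFHeaderLineType.Integer, "...") before writing
        final Genotype annotated = new GenotypeBuilder(g)
                .attribute("CLIP", numberOfClipsOverlapping)
                .make();
        System.out.println(annotated.getExtendedAttribute("CLIP"));
    }
}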